From 0f3fc73c4df1158760d8959d7d66d3eb1265fd9e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 31 Jan 2022 15:52:52 -0800 Subject: [PATCH 001/247] accl: Adding src code for PushEngine. --- src/accl/push_engine.hh | 69 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/accl/push_engine.hh diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh new file mode 100644 index 0000000000..eda9d7b707 --- /dev/null +++ b/src/accl/push_engine.hh @@ -0,0 +1,69 @@ +#ifndef __ACCL_PUSH_ENGINE_HH__ +#define __ACCL_PUSH_ENGINE_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/PushEngine.hh" +#include "sim/clocked_object.hh" + +class PushEngine : public ClockedObject +{ + private: + + class PushRespPort : public ResponsePort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + } + + class PushReqPort : public RequestPort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual bool recvTimingResp(PacketPtr pkt); + } + + class PushMemPort : public RequestPort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushMemPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + bool sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + } + + PushRespPort respPort; + PushReqPort reqPort; + PushMemPort memPort; + + std::queue vertexQueue; + std::queue updateQueue; + + std::pair interpretPackPtr(PacketPtr pkt); + +}; + +#endif // __ACCL_PUSH_ENGINE_HH__ From 0dd0beb81d3910a313bb97c0c0dd1489e9f567ae Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 
Feb 2022 17:56:49 -0800 Subject: [PATCH 002/247] Adding implementation for PushEngine (wip). --- src/accl/push_engine.cc | 120 ++++++++++++++++++++++++++++++++++++++++ src/accl/push_engine.hh | 63 ++++++++++++++++++++- src/accl/util.cc | 16 ++++++ src/accl/util.hh | 4 ++ 4 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 src/accl/push_engine.cc create mode 100644 src/accl/util.cc create mode 100644 src/accl/util.hh diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc new file mode 100644 index 0000000000..bc3138f61e --- /dev/null +++ b/src/accl/push_engine.cc @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/push_engine.hh" + +#include "debug/PushEngine.hh" + +PushEngine::PushEngine(const PushEngineParams& params): + ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), + vertexQueueSize(params.vertex_queue_size), + vertexQueueLen(0), + updateQueue(params.update_queue_size), + updateQueueLen(0), + nextReceiveEvent([this]{ processNextReceiveEvent(); }, name()), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextCreateEvent([this]{ processNextCreateEvent(); }, name()), + nextSendEvent([this]{ processNextSendEvent(); }, name()) +{} + +Port & +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +bool +PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleUpdate(pkt); +} + +bool +PushEngine::handleUpdate(PacketPtr pkt) +{ + if (vertexQueueLen < vertexQueueSize) { + vertexQueue.push(pkt) + vertexQueueLen++; + return true; + + if (!nextReceiveEvent.scheduled()){ + schedule(nextReceiveEvent, nextCycle()); + } + } + return false; +} + +void +PushEngine::processNextReceiveEvent() +{ + PacketPtr 
updatePkt = vertexQueue.pop(); + uint8_t* data = updatePkt->getData(); + + Addr edgeListAddr = ; // TODO: Generalize finding this address. + int outDegree = ; // TODO: Generalize finding this value. + + Addr reqAddr = (edgeListAddr / 64) * 64; + Addr offsetAddr = edgeListAddr % 64; + + PacketPtr pkt = getReadPacket(reqAddr, 64, requestorId); + + memPort.sendPacket(pkt); + + +} + +void +PushEngine::processNextReadEvent() +{ + +} + +void +PushEngine::processNextCreateEvent() +{ + +} + +void +PushEngine::processNextSendEvent() +{ + +} \ No newline at end of file diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index eda9d7b707..6ab902d0e2 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -1,8 +1,35 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #ifndef __ACCL_PUSH_ENGINE_HH__ #define __ACCL_PUSH_ENGINE_HH__ #include -#include #include "base/addr_range_map.hh" #include "base/statistics.hh" @@ -10,6 +37,7 @@ #include "mem/packet.hh" #include "params/PushEngine.hh" #include "sim/clocked_object.hh" +#include "sim/system.hh" class PushEngine : public ClockedObject { @@ -18,6 +46,7 @@ class PushEngine : public ClockedObject class PushRespPort : public ResponsePort { private: + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; @@ -55,14 +84,42 @@ class PushEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); } - PushRespPort respPort; + System* const system; + const RequestorID requestorId; + PushReqPort reqPort; + PushRespPort respPort; + PushMemPort memPort; std::queue vertexQueue; + int vertexQueueSize; + int vertexQueueLen; + std::queue updateQueue; + int updateQueueSize; + int updateQueueLen; + + EventFunctionWrapper nextReceiveEvent; + void processNextReceiveEvent(); + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextCreateEvent; + void processNextCreateEvent(); + + EventFunctionWrapper nextSendEvent; + void processNextSendEvent(); + + bool handleUpdate(PacketPtr pkt); + + public: + + PushEngine(const PushEngineParams ¶ms); - std::pair interpretPackPtr(PacketPtr pkt); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; diff --git a/src/accl/util.cc 
b/src/accl/util.cc new file mode 100644 index 0000000000..20abd1c13a --- /dev/null +++ b/src/accl/util.cc @@ -0,0 +1,16 @@ +#include "accl/util.hh" + +PacketPtr +getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) +{ + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr)requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} diff --git a/src/accl/util.hh b/src/accl/util.hh new file mode 100644 index 0000000000..c621b9e45c --- /dev/null +++ b/src/accl/util.hh @@ -0,0 +1,4 @@ +#include "mem/packet.hh" + +PacketPtr getReadPacket(Addr addr, unsigned int size); + From 3b359ade313c989b465a5879d738096526cbf6c4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Feb 2022 13:36:08 -0800 Subject: [PATCH 003/247] Adding util source code. --- src/accl/util.cc | 28 ++++++++++++++++++++++++++++ src/accl/util.hh | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/accl/util.cc b/src/accl/util.cc index 20abd1c13a..8d975c482f 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #include "accl/util.hh" PacketPtr diff --git a/src/accl/util.hh b/src/accl/util.hh index c621b9e45c..18b8e4c197 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -1,4 +1,50 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "base/types.hh" #include "mem/packet.hh" -PacketPtr getReadPacket(Addr addr, unsigned int size); +struct WorkListItem +{ + uint32_t temp_prop; + uint32_t prop; + uint32_t degree; + Addr edgeList; +} + +struct Edge +{ + uint32_t weight; + Addr neighbor; +} + +WorkListItem& memoryToWorkList(uint8_t* data); +Edge& memoryToEdge(uint8_t* data); +PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); From f74e9df55bafd83ea180ad6b9db91840f0e3b9e5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 31 Jan 2022 11:34:07 -0800 Subject: [PATCH 004/247] Adding the first version of Apply engine --- src/accl/apply.cc | 129 ++++++++++++++++++++++++++++++++++++++++++++ src/accl/apply.hh | 132 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 src/accl/apply.cc create mode 100644 src/accl/apply.hh diff --git a/src/accl/apply.cc b/src/accl/apply.cc new file mode 100644 index 0000000000..d0e2b712a6 --- /dev/null +++ b/src/accl/apply.cc @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/apply.h" + +#include + + +typedef std::pair ReqPair; +typedef std::pair QueuePair; + +Apply::Apply(const ApplyParams ¶ms): + ClockedObject(params), + nextApplyEvent([this]{processNextApplyEvent; }, name()), + nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), + queueSize(params.applyQueueSize) //add this to .py +{ + applyReadQueue(queueSize); + pplyWriteQueue(queueSize); +} + +bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleWL(pkt)){ + return false; + } + return true; +} + +bool Apply::handleWL(PacketPtr pkt){ + auto queue = applyReadQueue; + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextApplyCheckEvent.scheduled()){ + schedule(nextApplyCheckEvent, nextCycle()); + } + return true; +} + + +void Apply::processNextApplyCheckEvent(){ + auto queue = applyReadQueue; + memPort = ApplyMemPort + while(!queue.empty()){ + auto pkt = queue.pop() + /// conver to ReadReq + bool ret = memPort->sendPacket(pkt); + // handel responsehere + if (!ret) + break; + } + +} + +virtual bool +Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); 
+} + +bool +Apply::handleMemResp(PacktPtr pkt) +{ + auto queue = applyWriteQueue; + //check pkt (temp_prop != prop) + if (temp_prop != prop){ + //update prop with temp_prop + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); + } + return true; + } + return true; +} + + + +void +Apply::processNextApplyEvent(){ + auto queue = applyWriteQueue; + memPort = ApplyMemPort; + pushPort = ApplyReqPort; + while(!queue.empty()){ + auto pkt = queue.pop() + /// conver to ReadReq + bool ret = memPort->sendPacket(pkt); + bool push = pushPort->sendPacket(pkt); + // handel responsehere + if (!ret || !push) + break; + + } + +} \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh new file mode 100644 index 0000000000..2ae593a1cb --- /dev/null +++ b/src/accl/apply.hh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_APPLY_HH__ +#define __ACCL_APPLY_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class Apply : public ClockedObject +{ + private: + + class ApplyRespPort : public ResponsePort + { + private: + Apply *owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ApplyRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + } + + class ApplyReqPort : public RequestPort + { + private: + APPLY *owner; + bool _blocked; + PacketPtr blockedPacket; + + struct ApplyQueue{ + std::queue applyQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return applyQueue.size() == queueSize; + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize){} + }; + public: + ApplyReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual bool recvTimingResp(PacketPtr pkt); + } + + class ApplyMemPort : public RequestPort + { + private: + Apply *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + ApplyReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + bool 
sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + + } + bool handleWL(PacketPtr pkt); + bool sendPacket(); + //one queue for write and one for read a priotizes write over read + void readApplyBuffer(); + bool handleMemResp(PacktPtr resp); + void writePushBuffer(); + + + //Events + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + ApplyQueue applyQueue; + ApplyMemPort memPort; + public(const ApplyParams &apply); +}; + +#endif // __ACCL_APPLY_HH__ \ No newline at end of file From 7945cf333644c9ad0f0e5dfb99e8040d3944785d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 5 Feb 2022 20:34:12 -0800 Subject: [PATCH 005/247] Portotyping memory interface --- src/accl/apply.cc | 36 ++++++++++++++++++++++-------------- src/accl/apply.hh | 8 +++++--- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index d0e2b712a6..b0ef5e8513 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -46,7 +46,7 @@ Apply::Apply(const ApplyParams ¶ms): bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) { - if (!owner->handleWL(pkt)){ + if (!this->handleWL(pkt)){ return false; } return true; @@ -73,7 +73,9 @@ void Apply::processNextApplyCheckEvent(){ while(!queue.empty()){ auto pkt = queue.pop() /// conver to ReadReq - bool ret = memPort->sendPacket(pkt); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + bool ret = memPort->sendPacket(memPkt); // handel responsehere if (!ret) break; @@ -84,27 +86,24 @@ void Apply::processNextApplyCheckEvent(){ virtual bool Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) { - 
return owner->handleMemResp(pkt); + return this->handleMemResp(pkt); } bool Apply::handleMemResp(PacktPtr pkt) { auto queue = applyWriteQueue; - //check pkt (temp_prop != prop) - if (temp_prop != prop){ - //update prop with temp_prop + if (queue->blocked()){ sendPktRetry = true; return false; } else - queue->push(pkt); + queue->push(writePkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } return true; - } return true; } @@ -117,12 +116,21 @@ Apply::processNextApplyEvent(){ pushPort = ApplyReqPort; while(!queue.empty()){ auto pkt = queue.pop() - /// conver to ReadReq - bool ret = memPort->sendPacket(pkt); - bool push = pushPort->sendPacket(pkt); - // handel responsehere - if (!ret || !push) - break; + uint64_t* data = pkt->getPtr(); + uint32_t* prop = data; + uint32_t* temp_prop = prop + 1; + if (*temp_prop != *prop){ + //update prop with temp_prop + *prop = min(*prop , *temp_prop); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); + writePkt->setData(data); + bool ret = memPort->sendPacket(pkt); + bool push = pushPort->sendPacket(pkt); + // handel response here + if (!ret || !push) + break; + } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 2ae593a1cb..e9c27a1fcf 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -61,7 +61,7 @@ class Apply : public ClockedObject class ApplyReqPort : public RequestPort { private: - APPLY *owner; + Apply *owner; bool _blocked; PacketPtr blockedPacket; @@ -124,9 +124,11 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ - ApplyQueue applyQueue; + ApplyQueue applyReadQueue; + ApplyQueue applyWriteQueue; ApplyMemPort memPort; - public(const ApplyParams &apply); + std::pair + public(const ApplyParams &apply); //fix this }; #endif // __ACCL_APPLY_HH__ \ No newline at end of file From 14426cddc9527e56cf96cb15d7382199e4309e98 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 12:04:02 
-0800 Subject: [PATCH 006/247] [wip] Improving the implementation. Adding address range, python params. --- src/accl/Apply.py | 39 ++++++++++++ src/accl/apply.cc | 153 +++++++++++++++++++++++++++++++++++----------- src/accl/apply.hh | 42 ++++++++++--- 3 files changed, 191 insertions(+), 43 deletions(-) create mode 100644 src/accl/Apply.py diff --git a/src/accl/Apply.py b/src/accl/Apply.py new file mode 100644 index 0000000000..01c627d4c8 --- /dev/null +++ b/src/accl/Apply.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class Apply(ClockedObject): + type = 'Apply' + cxx_header = "accl/apply.hh" + cxx_class = 'gem5::Apply' + + respPort = ResponsePort("Receives requests from WorkList") + reqPort = RequestPort("Sends requests to Push") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b0ef5e8513..d605537033 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -26,22 +26,41 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/apply.h" +#include "accl/apply.hh" #include - -typedef std::pair ReqPair; -typedef std::pair QueuePair; - Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), nextApplyEvent([this]{processNextApplyEvent; }, name()), nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), queueSize(params.applyQueueSize) //add this to .py { applyReadQueue(queueSize); - pplyWriteQueue(queueSize); + applyWriteQueue(queueSize); +} + +Port & +Apply::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +AddrRangeList +Apply::ApplyRespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); } bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) @@ -52,6 +71,65 @@ bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) return true; } +void +Apply::ApplyRespPort::trySendRetry() +{ + sendRetryReq(); +} + + +virtual bool +Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +void +WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +Apply::ApplyMemPort::trySendRetry() +{ + sendRetryReq(); +} + +void +Apply::ApplyMemPort::recvReqRetry() +{ + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +void +WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +Apply::ApplyRequestPort::recvReqRetry() +{ + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +AddrRangeList +Apply::getAddrRanges() const +{ + return memPort.getAddrRanges(); +} + bool 
Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue->blocked()){ @@ -59,34 +137,29 @@ bool Apply::handleWL(PacketPtr pkt){ return false; } else queue->push(pkt); - if(!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; } - void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - memPort = ApplyMemPort while(!queue.empty()){ - auto pkt = queue.pop() - /// conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - bool ret = memPort->sendPacket(memPkt); - // handel responsehere - if (!ret) - break; + if(!memPort->blocked()){ + auto pkt = queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + respPort->trySendRetry(); + queue->sendPktRetry = false; + } + // conver to ReadReq + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + memPort->sendPacket(memPkt); + } + else + return; } - -} - -virtual bool -Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - return this->handleMemResp(pkt); } bool @@ -107,31 +180,39 @@ Apply::handleMemResp(PacktPtr pkt) return true; } - - void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - memPort = ApplyMemPort; - pushPort = ApplyReqPort; while(!queue.empty()){ - auto pkt = queue.pop() + auto pkt = queue.front(); uint64_t* data = pkt->getPtr(); uint32_t* prop = data; uint32_t* temp_prop = prop + 1; if (*temp_prop != *prop){ //update prop with temp_prop *prop = min(*prop , *temp_prop); - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); writePkt->setData(data); - bool ret = memPort->sendPacket(pkt); - bool push = pushPort->sendPacket(pkt); - // handel response here - if (!ret || !push) + if (!memPort->blocked() && !reqPort->blocked()){ //re-think this + 
memPort->sendPacket(pkt); + applyReqPort->sendPacket(pkt); + queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } + else break; } - + else{ + queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } } - } \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh index e9c27a1fcf..fab4cf871a 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -56,6 +56,7 @@ class Apply : public ClockedObject virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + void trySendRetry(); } class ApplyReqPort : public RequestPort @@ -64,7 +65,6 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; - struct ApplyQueue{ std::queue applyQueue; const uint_32 queueSize; @@ -83,12 +83,19 @@ class Apply : public ClockedObject ApplyQueue(uint32_t qSize): queueSize(qSize){} }; + public: ApplyReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); + void sendPacket(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + protected: + void recvReqRetry() override; virtual bool recvTimingResp(PacketPtr pkt); - } + }; class ApplyMemPort : public RequestPort { @@ -96,13 +103,21 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; + public: ApplyReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - bool sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); + void trySendRetry(); + bool blocked(){ + return _blocked; + } + + protected: virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; + }; - } bool handleWL(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read @@ -110,7 +125,6 @@ class Apply : public ClockedObject bool handleMemResp(PacktPtr resp); void writePushBuffer(); - //Events void 
processNextApplyCheckEvent(); /* Syncronously checked @@ -124,11 +138,25 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ + void processNextApplyEvent(); + EventFunctionWrapper nextApplyEvent; + + void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyCheckEvent; + + AddrRangeList getAddrRanges() const; + ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; + ApplyMemPort memPort; - std::pair - public(const ApplyParams &apply); //fix this + ApplyRespPort respPort; + ApplyRequestPort reqPort; + + public: + Apply(const ApplyParams &apply); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; #endif // __ACCL_APPLY_HH__ \ No newline at end of file From 8e79d19e2028a80dda8aa7b2026a010310fec300 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 13:14:27 -0800 Subject: [PATCH 007/247] [wip] minor fixes to Apply engine --- src/accl/apply.cc | 8 ++++---- src/accl/apply.hh | 44 +++++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index d605537033..6ad630f0ac 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -96,7 +96,7 @@ WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) void Apply::ApplyMemPort::trySendRetry() { - sendRetryReq(); + sendRetryResp(); } void @@ -108,7 +108,7 @@ Apply::ApplyMemPort::recvReqRetry() } void -WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) +WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -117,7 +117,7 @@ WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyRequestPort::recvReqRetry() +Apply::ApplyReqtPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -158,7 +158,7 @@ void Apply::processNextApplyCheckEvent(){ memPort->sendPacket(memPkt); } else - return; + break; } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index fab4cf871a..dae3d8ec0e 100644 --- 
a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -43,11 +43,29 @@ class Apply : public ClockedObject { private: + struct ApplyQueue{ + std::queue applyQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return applyQueue.size() == queueSize; + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize){} + }; + class ApplyRespPort : public ResponsePort { private: Apply *owner; - bool _blocked; PacketPtr blockedPacket; public: @@ -55,9 +73,11 @@ class Apply : public ClockedObject PortID id=InvalidPortID); virtual AddrRangeList getAddrRanges(); - virtual bool recvTimingReq(PacketPtr pkt); void trySendRetry(); - } + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + }; class ApplyReqPort : public RequestPort { @@ -65,24 +85,6 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; - struct ApplyQueue{ - std::queue applyQueue; - const uint_32 queueSize; - bool sendPktRetry; - - bool blocked(){ - return applyQueue.size() == queueSize; - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize){} - }; public: ApplyReqPort(const std::string& name, SimObject* _owner, From 469a8f7f7897289d5295500f18e7a60e691123d0 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 7 Feb 2022 12:26:01 -0800 Subject: [PATCH 008/247] Worklist engine implementation --- src/accl/wl_engine.cc | 185 ++++++++++++++++++++++++++++++++++++++++++ src/accl/wl_engine.hh | 143 ++++++++++++++++++++++++++++++++ 2 files changed, 328 insertions(+) create mode 100644 src/accl/wl_engine.cc create mode 100644 src/accl/wl_engine.hh diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc new file mode 100644 index 0000000000..28f8a4fe11 --- /dev/null +++ b/src/accl/wl_engine.cc @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 The Regents of the 
University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/wl_engine.hh" + +#include + + +WLEngine::WLEngine(const WLEngineParams ¶ms): + ClockedObject(params), + nextWLReadEvent([this]{processNextWLReadEvent; }, name()), + nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), + queueSize(params.wlQueueSize) //add this to .py +{ + wlReadQueue(queueSize); + wlWriteQueue(queueSize); +} + +bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) +{ + if (!this->handleWLUpdate(pkt)){ + return false; + } + return true; +} + +bool WLEngine::handleWLUpdate(PacketPtr pkt){ + auto queue = wlReadQueue; + if (queue->blocked()){ + queue->sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextWLReadEvent.scheduled()){ + schedule(nextWLReadEvent, nextCycle()); + } + return true; +} + + +void WLEngine::processNextWLReadEvent(){ + auto queue = wlReadQueue; + memPort = WLMemPort + while(!queue.empty()){ //create a map instead of front + auto pkt = queue.front() + /// conver to ReadReq + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + if (!memPort->blocked()){ + memPort->sendPacket(memPkt); + break; + } + } + +} + +void +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. 
+ assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; + + owner->wakeUp(); //TODO +} + +virtual bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +bool +WLEngine::handleMemResp(PacktPtr pkt) +{ + auto queue = applyWriteQueue; + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(writePkt); + + if(!nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } + return true; + return true; +} + +void +WLEngine::processNextWLReduceEvent(){ + auto queue = wlWriteQueue; + auto updateQ = wlReadQueue; + memPort = WLMemPort; + applyPort = WLReqPort; + while(!queue.empty()){ + auto update = updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + WLRespPort->trySendRetry(); + updateQ->sendPktRetry = false; + } + auto pkt = queue.front() + uint64_t* updatePtr = pkt->getPtr(); + uint64_t* data = pkt->getPtr(); + uint32_t* value = updatePtr; + uint32_t* temp_prop = prop + 1; + if (*value != *prop){ + //update prop with temp_prop + *temp_prop = min(*value , *temp_prop); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); + writePkt->setData(data); + if (!memPort->blocked() && !applyPort->blocked()){ + memPort->sendPacket(pkt); + applyPort->sendPacket(pkt); + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } + else + break; + } + else{ + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + + } + + } + +} + +void +WLEngine::WLRespPort::trySendRetry() +{ + sendRetryReq(); +} + +void +WLEngine::WLMemPort::trySendRetry() +{ + sendRetryResp(); +} \ No newline at end of file diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh new file mode 100644 index 0000000000..7269965ff2 --- 
/dev/null +++ b/src/accl/wl_engine.hh @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_WLE_HH__ +#define __ACCL_WLE_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class WLEngine : public ClockedObject +{ + private: + + struct WLQueue{ + std::queue wlQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return wlQueue.size() == queueSize; + } + bool empty(){ + return wlQueue.empty(); + } + void push(PacketPtr pkt){ + wlQueue.push(pkt); + } + + WLReqPort(uint32_t qSize): + queueSize(qSize){} + }; + + class WLRespPort : public ResponsePort //From Push engine + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + WLRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + } + + class WLReqPort : public RequestPort //To Apply Engine + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + WLReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + void trySendRetry(); + virtual bool recvTimingResp(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + } + + class WLMemPort : public RequestPort + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + WLMemPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + void sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + void trySendRetry(); + bool blocked(){ + return _blocked; + } + } + bool handleWLU(PacketPtr pkt); + bool sendPacket(); + //one queue for write and one for read a priotizes write over read + void readWLBuffer(); + bool handleMemResp(PacktPtr resp); + + + //Events + void processNextWLReadEvent(); + /* Syncronously checked + If there are any active vertecies: + create 
memory read packets + MPU::MPU::MemPortsendTimingReq + */ + void processNextWLReduceEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + WLQueue wlReadQueue; + WLQueue wlWriteQueue; + WLMemPort memPort; + std::pair + public: + WLEngine(const WLEngineParams ¶ms); //fix this +}; + +#endif // __ACCL_WLE_HH__ \ No newline at end of file From af73e980a6f14878b8ad77fc6c4d7a649f3d2bcd Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 13:06:32 -0800 Subject: [PATCH 009/247] [wip] Adding the python file to the WLE --- src/accl/WLEngine.py | 39 ++++++++++++ src/accl/wl_engine.cc | 138 ++++++++++++++++++++++++++++-------------- src/accl/wl_engine.hh | 46 ++++++++++---- 3 files changed, 165 insertions(+), 58 deletions(-) create mode 100644 src/accl/WLEngine.py diff --git a/src/accl/WLEngine.py b/src/accl/WLEngine.py new file mode 100644 index 0000000000..fe6b25b6ba --- /dev/null +++ b/src/accl/WLEngine.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class WLEngine(ClockedObject): + type = 'WLEngine' + cxx_header = "accl/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + respPort = ResponsePort("Receives updates") + reqPort = RequestPort("Sends requests to Apply") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 28f8a4fe11..fbf201720d 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -33,6 +33,9 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), nextWLReadEvent([this]{processNextWLReadEvent; }, name()), nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), queueSize(params.wlQueueSize) //add this to .py @@ -41,6 +44,26 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): wlWriteQueue(queueSize); } +Port & +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == 
"memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +AddrRangeList +WLEngine::WLRespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) { if (!this->handleWLUpdate(pkt)){ @@ -49,6 +72,68 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } +void +WLEngine::WLRespPort::trySendRetry() +{ + sendRetryReq(); +} + +void +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. + assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +virtual bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +void +WLEngine::WLMemPort::trySendRetry() +{ + sendRetryResp(); +} + +void +WLEngine::WLReqPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. 
+ assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +void +WLEngine::WLReqPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +AddrRangeList +WLEngine::getAddrRanges() const +{ + return memPort.getAddrRanges(); +} + bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = wlReadQueue; if (queue->blocked()){ @@ -63,14 +148,14 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ return true; } - void WLEngine::processNextWLReadEvent(){ auto queue = wlReadQueue; memPort = WLMemPort while(!queue.empty()){ //create a map instead of front auto pkt = queue.front() /// conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); if (!memPort->blocked()){ memPort->sendPacket(memPkt); @@ -80,37 +165,10 @@ void WLEngine::processNextWLReadEvent(){ } -void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -WLEngine::WLMemPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. 
- assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; - - owner->wakeUp(); //TODO -} - -virtual bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) -{ - return this->handleMemResp(pkt); -} - bool WLEngine::handleMemResp(PacktPtr pkt) { - auto queue = applyWriteQueue; + auto queue = wlWriteQueue; if (queue->blocked()){ sendPktRetry = true; return false; @@ -128,12 +186,11 @@ void WLEngine::processNextWLReduceEvent(){ auto queue = wlWriteQueue; auto updateQ = wlReadQueue; - memPort = WLMemPort; - applyPort = WLReqPort; + applyPort = reqPort; while(!queue.empty()){ auto update = updateQ.pop() if (!updateQ->blocked() & updateQ->sendPktRetry){ - WLRespPort->trySendRetry(); + respPort->trySendRetry(); updateQ->sendPktRetry = false; } auto pkt = queue.front() @@ -144,7 +201,8 @@ WLEngine::processNextWLReduceEvent(){ if (*value != *prop){ //update prop with temp_prop *temp_prop = min(*value , *temp_prop); - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); writePkt->setData(data); if (!memPort->blocked() && !applyPort->blocked()){ @@ -171,15 +229,3 @@ WLEngine::processNextWLReduceEvent(){ } } - -void -WLEngine::WLRespPort::trySendRetry() -{ - sendRetryReq(); -} - -void -WLEngine::WLMemPort::trySendRetry() -{ - sendRetryResp(); -} \ No newline at end of file diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 7269965ff2..3f39ec7ee8 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -66,7 +66,6 @@ class WLEngine : public ClockedObject { private: WLEngine *owner; - bool _blocked; PacketPtr blockedPacket; public: @@ -74,11 +73,11 @@ class WLEngine : public ClockedObject PortID id=InvalidPortID); virtual AddrRangeList getAddrRanges(); + void trySendRetry(); + + protected: virtual bool recvTimingReq(PacketPtr pkt); - bool blocked(){ - return 
_blocked; - } - } + }; class WLReqPort : public RequestPort //To Apply Engine { @@ -86,15 +85,19 @@ class WLEngine : public ClockedObject WLEngine *owner; bool _blocked; PacketPtr blockedPacket; + public: WLReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - void trySendRetry(); - virtual bool recvTimingResp(PacketPtr pkt); + void sendPacket(PacketPtr pkt); bool blocked(){ return _blocked; } - } + + protected: + void recvReqRetry() override; + virtual bool recvTimingResp(PacketPtr pkt); + }; class WLMemPort : public RequestPort { @@ -102,16 +105,21 @@ class WLEngine : public ClockedObject WLEngine *owner; bool _blocked; PacketPtr blockedPacket; + public: WLMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); void sendPacket(PacktPtr pkt); - virtual bool recvTimingResp(PacketPtr pkt); void trySendRetry(); bool blocked(){ return _blocked; } - } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; + }; + bool handleWLU(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read @@ -131,13 +139,27 @@ class WLEngine : public ClockedObject read + write Write edgelist loc in buffer */ + void processNextWLReadEvent(); + EventFunctionWrapper nextWLReadEvent; + + void processNextWLReduceEvent(); + EventFunctionWrapper nextWLReduceEvent; + + AddrRangeList getAddrRanges() const; WLQueue wlReadQueue; WLQueue wlWriteQueue; WLMemPort memPort; - std::pair + + WLMemPort memPort; + WLRespPort respPort; + WLRequestPort reqPort; + public: - WLEngine(const WLEngineParams ¶ms); //fix this + + WLEngine(const WLEngineParams ¶ms); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; #endif // __ACCL_WLE_HH__ \ No newline at end of file From 23e3f42ae186681dedf173e0b42a20bd6b918ab2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Feb 2022 13:06:45 -0800 Subject: [PATCH 010/247] Changing some small errors --- 
src/accl/wl_engine.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index fbf201720d..e49ad44bf1 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -162,7 +162,6 @@ void WLEngine::processNextWLReadEvent(){ break; } } - } bool @@ -188,12 +187,8 @@ WLEngine::processNextWLReduceEvent(){ auto updateQ = wlReadQueue; applyPort = reqPort; while(!queue.empty()){ - auto update = updateQ.pop() - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; - } - auto pkt = queue.front() + auto update = updateQ.front(); + auto pkt = queue.front(); uint64_t* updatePtr = pkt->getPtr(); uint64_t* data = pkt->getPtr(); uint32_t* value = updatePtr; @@ -213,6 +208,11 @@ WLEngine::processNextWLReduceEvent(){ memPort->trySendRetry(); queue->sendPktRetry = false; } + updateQ.pop(); + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } } else break; @@ -223,6 +223,11 @@ WLEngine::processNextWLReduceEvent(){ memPort->trySendRetry(); queue->sendPktRetry = false; } + updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } } From 495fc758be9b02fa2e4d8187c57d486c70aa78e3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Feb 2022 17:39:58 -0800 Subject: [PATCH 011/247] [wip] using util in the creating memory packets --- src/accl/apply.cc | 69 ++++++++++++++++++------------ src/accl/apply.hh | 6 +++ src/accl/util.cc | 43 +++++++++++++++++++ src/accl/util.hh | 3 +- src/accl/wl_engine.cc | 97 ++++++++++++++++++++++++------------------- src/accl/wl_engine.hh | 10 ++++- 6 files changed, 155 insertions(+), 73 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 6ad630f0ac..6b474d5628 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -32,6 +32,8 @@ Apply::Apply(const ApplyParams 
¶ms): ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), @@ -145,20 +147,25 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - while(!queue.empty()){ - if(!memPort->blocked()){ - auto pkt = queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - respPort->trySendRetry(); - queue->sendPktRetry = false; - } - // conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - memPort->sendPacket(memPkt); + if(!memPort->blocked()){ + auto pkt = queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + respPort->trySendRetry(); + queue->sendPktRetry = false; } - else - break; + // conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr req = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + requestOffset[req] = req_offset; + memPort->sendPacket(memPkt); + } + else{ + break; + } + if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ + schedule(nextApplyCheckEvent, nextCycle()); } } @@ -183,21 +190,27 @@ Apply::handleMemResp(PacktPtr pkt) void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - while(!queue.empty()){ auto pkt = queue.front(); - uint64_t* data = pkt->getPtr(); - uint32_t* prop = data; - uint32_t* temp_prop = prop + 1; - if (*temp_prop != *prop){ - //update prop with temp_prop - *prop = min(*prop , *temp_prop); - RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); - writePkt->setData(data); - if (!memPort->blocked() && !reqPort->blocked()){ //re-think this - memPort->sendPacket(pkt); - applyReqPort->sendPacket(pkt); + uint8_t* data = pkt->getPtr(); + + RequestPtr req = 
pkt->req; + int request_offset = requestOffset[req]; + WorkListItem wl = memoryToWorkList(data + request_offset); + uint32_t prop = wl.prop; + uint32_t temp_prop = wl.temp_prop; + + if (temp_prop != prop){ + if (!memPort->blocked() && !reqPort->blocked()){ + //update prop with temp_prop + wl.prop = min(prop , temp_prop); + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + memPort->sendPacket(writePkt); + applyReqPort->sendPacket(writePkt); queue.pop(); if(queue->sendPktRetry && !queue->blocked()){ memPort->trySendRetry(); @@ -214,5 +227,7 @@ Apply::processNextApplyEvent(){ queue->sendPktRetry = false; } } + if(!queue.empty() && !nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); } } \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh index dae3d8ec0e..b213d37667 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -32,6 +32,7 @@ #include #include +#include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" #include "mem/port.hh" @@ -146,6 +147,9 @@ class Apply : public ClockedObject void processNextApplyCheckEvent(); EventFunctionWrapper nextApplyCheckEvent; + System* const system; + const RequestorID requestorId; + AddrRangeList getAddrRanges() const; ApplyQueue applyReadQueue; @@ -155,6 +159,8 @@ class Apply : public ClockedObject ApplyRespPort respPort; ApplyRequestPort reqPort; + std::unordered_map requestOffset; + public: Apply(const ApplyParams &apply); Port &getPort(const std::string &if_name, diff --git a/src/accl/util.cc b/src/accl/util.cc index 8d975c482f..8debd3a937 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -42,3 +42,46 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } + +PacketPtr getWritePacket(Addr addr, + unsigned 
int size, + uint8_t* data, + RequestorID requestorId) +{ + equestPtr req = std::make_shared(addr, size, 0, + requestorId); + req->setPC(((Addr)requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +WorkListItem& +memoryToWorkList(uint8_t* data){ + WorkListItem wl; + uint32_t temp_prop = *((uint32_t*) data)); + + uint32_t prop = *((uint32_t*) (data + 4)); + + uint32_t degree = *((uint32_t*) (data + 8)); + + uint32_t addr = *((uint32_t*) (data + 12)); + + retrun wl = {temp_prop, prop, degree, addr}; +} + +unit8_t* +workListToMemory(WorkListItem wl){ + int data_size = sizeof(WorkListItem)/sizeof(uint_8) + uint_8* data = new uint8_t [data_size]; + uint_32* wList = (uint_32*)data; + *wList = wl.prop; + *wList + 1 = wl.temp_prop; + *wList + 2 = wl.degree; + *wList + 3 = wl.edgeIndex; + + return data; +} \ No newline at end of file diff --git a/src/accl/util.hh b/src/accl/util.hh index 18b8e4c197..00ccb7ddd9 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -34,7 +34,7 @@ struct WorkListItem uint32_t temp_prop; uint32_t prop; uint32_t degree; - Addr edgeList; + uint32_t edgeIndex; } struct Edge @@ -44,6 +44,7 @@ struct Edge } WorkListItem& memoryToWorkList(uint8_t* data); +unit8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index e49ad44bf1..7d6d707ae6 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -33,6 +33,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), @@ -40,8 +42,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), 
queueSize(params.wlQueueSize) //add this to .py { - wlReadQueue(queueSize); - wlWriteQueue(queueSize); + updateQueue(queueSize); + responseQueue(queueSize); } Port & @@ -135,7 +137,7 @@ WLEngine::getAddrRanges() const } bool WLEngine::handleWLUpdate(PacketPtr pkt){ - auto queue = wlReadQueue; + auto queue = updateQueue; if (queue->blocked()){ queue->sendPktRetry = true; return false; @@ -149,25 +151,32 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ } void WLEngine::processNextWLReadEvent(){ - auto queue = wlReadQueue; + auto queue = updateQueue; memPort = WLMemPort while(!queue.empty()){ //create a map instead of front auto pkt = queue.front() /// conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); + std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + requestOffset[req] = req_offset; if (!memPort->blocked()){ + queue.pop() memPort->sendPacket(memPkt); break; } } + if(!queue.empty() && !nextWLReadEvent.scheduled()){ + schedule(nextWLReadEvent, nextCycle()); + } } bool WLEngine::handleMemResp(PacktPtr pkt) { - auto queue = wlWriteQueue; + auto queue = responseQueue; if (queue->blocked()){ sendPktRetry = true; return false; @@ -183,54 +192,56 @@ WLEngine::handleMemResp(PacktPtr pkt) void WLEngine::processNextWLReduceEvent(){ - auto queue = wlWriteQueue; - auto updateQ = wlReadQueue; + auto queue = responseQueue; + auto updateQ = updateQueue; applyPort = reqPort; - while(!queue.empty()){ - auto update = updateQ.front(); - auto pkt = queue.front(); - uint64_t* updatePtr = pkt->getPtr(); - uint64_t* data = pkt->getPtr(); - uint32_t* value = updatePtr; - uint32_t* temp_prop = prop + 1; - if (*value != *prop){ - //update prop with temp_prop - *temp_prop = min(*value , *temp_prop); - RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); - 
writePkt->setData(data); - if (!memPort->blocked() && !applyPort->blocked()){ - memPort->sendPacket(pkt); - applyPort->sendPacket(pkt); - queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; - } - } - else - break; - } - else{ + auto update = updateQ.front(); + auto value = update->getPtr(); + auto pkt = queue.front(); + uint8_t* data = pkt->getPtr(); + RequestPtr req = pkt->req; + int request_offset = requestOffset[req]; + WorkListItem wl = memoryToWorkList(data + request_offset) + uint32_t temp_prop = wl.temp_prop; + if (temp_prop != *value){ + //update prop with temp_prop + temp_prop = min(value , temp_prop); + if (!memPort->blocked() && !applyPort->blocked()){ + wl.temp_prop = temp_prop; + unit8_t* wlItem = workListToMemory(wl); + memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + memPort->sendPacket(writePkt); + applyPort->sendPacket(writePkt); queue.pop(); if (!queue->blocked() && queue->sendPktRetry){ memPort->trySendRetry(); queue->sendPktRetry = false; } - updateQ.pop() + updateQ.pop(); if (!updateQ->blocked() & updateQ->sendPktRetry){ respPort->trySendRetry(); updateQ->sendPktRetry = false; } - } - + else + break; } + else{ + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } + } + if(!queue && !nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 3f39ec7ee8..7132283463 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -32,6 +32,7 @@ #include #include 
+#include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" #include "mem/port.hh" @@ -39,6 +40,7 @@ #include "params/MPU.hh" #include "sim/clocked_object.hh" + class WLEngine : public ClockedObject { private: @@ -145,10 +147,14 @@ class WLEngine : public ClockedObject void processNextWLReduceEvent(); EventFunctionWrapper nextWLReduceEvent; + System* const system; + const RequestorID requestorId; + std::unordered_map requestOffset; + AddrRangeList getAddrRanges() const; - WLQueue wlReadQueue; - WLQueue wlWriteQueue; + WLQueue updateQueue; + WLQueue responseQueue; WLMemPort memPort; WLMemPort memPort; From 394ffeb71c32901ae564babeadbcd5b6883fb5e5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Feb 2022 21:15:35 -0800 Subject: [PATCH 012/247] Completing PushEngine. --- src/accl/push_engine.cc | 174 ++++++++++++++++++++++++++++++---------- src/accl/push_engine.hh | 24 ++++-- src/accl/util.cc | 43 +++++++++- src/accl/util.hh | 6 +- src/mem/packet.hh | 2 + 5 files changed, 196 insertions(+), 53 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index bc3138f61e..cd5f73eea3 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -26,26 +26,25 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "accl/util.hh" #include "accl/push_engine.hh" - #include "debug/PushEngine.hh" -PushEngine::PushEngine(const PushEngineParams& params): - ClockedObject(params), +PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - vertexQueueSize(params.vertex_queue_size), - vertexQueueLen(0), - updateQueue(params.update_queue_size), - updateQueueLen(0), - nextReceiveEvent([this]{ processNextReceiveEvent(); }, name()), - nextReadEvent([this]{ processNextReadEvent(); }, name()), - nextCreateEvent([this]{ processNextCreateEvent(); }, name()), - nextSendEvent([this]{ processNextSendEvent(); }, name()) -{} + // vertexQueueSize(params.vertex_queue_size), + // vertexQueueLen(0), + // updateQueue(params.update_queue_size), + // updateQueueLen(0), + nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextSendEvent([this] { processNextSendEvent(); }, name()) +{ +} Port & PushEngine::getPort(const std::string &if_name, PortID idx) @@ -61,60 +60,151 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } -bool -PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); } -bool -PushEngine::handleUpdate(PacketPtr pkt) +AddrRangeList +PushEngine::PushRespPort::getAddrRanges() { - if (vertexQueueLen < vertexQueueSize) { - vertexQueue.push(pkt) - vertexQueueLen++; - return true; + owner->memPort->getAddrRanges(); +} - if (!nextReceiveEvent.scheduled()){ - schedule(nextReceiveEvent, nextCycle()); - } +bool PushEngine::handleUpdate(PacketPtr pkt) +{ + // if (vertexQueueLen < vertexQueueSize) { + // vertexQueue.push(pkt) + // vertexQueueLen++; + // if (!nextReceiveEvent.scheduled()) { + // 
schedule(nextReceiveEvent, nextCycle()); + // } + // return true; + // } + // return false; + vertexQueue.push(pkt) + if (!nextReceiveEvent.scheduled()) { + schedule(nextReceiveEvent, nextCycle()); } - return false; + return true; } -void -PushEngine::processNextReceiveEvent() +void PushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.pop(); - uint8_t* data = updatePkt->getData(); - - Addr edgeListAddr = ; // TODO: Generalize finding this address. - int outDegree = ; // TODO: Generalize finding this value. - - Addr reqAddr = (edgeListAddr / 64) * 64; - Addr offsetAddr = edgeListAddr % 64; + uint8_t *data = updatePkt->getData(); + + // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) + uint32_t edge_index = *((uint32_t *)data); + uint32_t degree = *((uint32_t *)(data + 4)); + uint32_t value = *((uint32_t *)(data + 8)); + + std::vector addr_queue; + std::vector offset_queue; + std::vector num_edge_queue; + + for (uint32_t index = 0; index < degree; index++) { + Addr edge_addr = (edge_index + index) * sizeof(Edge); + Addr req_addr = (edge_addr / 64) * 64; + Addr req_offset = edge_addr % 64; + if (addr_queue.size()) { + if (addr_queue.back() == req_addr) { + num_edge_queue.back()++; + } + else { + addr_queue.push(req_addr); + offset_queue.push(req_offset); + num_edge_queue.push(1); + } + } + else { + addr_queue.push(req_addr); + offset_queue.push(req_offset); + num_edge_queue.push(1); + } + } - PacketPtr pkt = getReadPacket(reqAddr, 64, requestorId); + for (int index = 0; index < addr_queue.size(); inedx++) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + memReqQueue.push(pkt); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = value; + } - memPort.sendPacket(pkt); + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + schedule(nextReadEvent, nextCycle()); + } +} +void PushEngine::processNextReadEvent() +{ + PacketPtr pkt = 
memReqQueue.front(); + if (!memPort.blocked()) { + memPort.sendPacket(pkt); + memReqQueue.pop(); + } + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + schedule(nextReadEvent, nextCycle()); + } } -void -PushEngine::processNextReadEvent() +bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) { + return owner->handleMemResp(pkt); +} +void PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + DPRINTF(MemScheduler, "Setting blocked to true on port %s\n", + this->name()); + _blocked = true; + } } -void -PushEngine::processNextCreateEvent() +void PushEngine::handleMemResp(PacketPtr pkt) { + RequestPtr req = pkt->req; + uint8_t *data = pkt->getPtr(); + + Addr offset = reqOffsetMap[req]; + int num_edges = reqNumEdgeMap[req]; + uint32_t value = reqValueMap[req]; + + int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); + for (int i = 0; i < num_edges; i++) { + uint8_t *curr_edge_data = data + offset + i * edge_in_bytes; + Edge e = memoryToEdge(curr_edge_data); + uint32_t *update_data = new uint32_t; + + // TODO: Implement propagate function here + *update_data = value + 1; + PacketPtr update = getUpdatePacket(e.neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); + updateQueue.push(update); + } + if (!nextSendEvent.scheduled() && !updateQueue.empty()) { + schedule(nextSendEvent, nextCycle()); + } } -void -PushEngine::processNextSendEvent() + +void PushEngine::processNextSendEvent() { + PacketPtr pkt = updateQueue.front(); + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + updateQueue.pop(); + } -} \ No newline at end of file + if (!nextSendEvent.scheduled() && !updateQueue.empty()) { + schedule(nextSendEvent, nextCycle()); + } +} diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 6ab902d0e2..a746dcc265 100644 --- 
a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -51,6 +51,7 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + //TODO: Implement this; PushRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); @@ -65,6 +66,7 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + // TODO: Implement this; PushReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); @@ -78,9 +80,12 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + // TODO: Implement this; PushMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - bool sendPacket(PacktPtr pkt); + + void sendPacket(PacktPtr pkt); + bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); } @@ -93,12 +98,18 @@ class PushEngine : public ClockedObject PushMemPort memPort; std::queue vertexQueue; - int vertexQueueSize; - int vertexQueueLen; + // int vertexQueueSize; + // int vertexQueueLen; + + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map reqValueMap; + + std::queue memReqQueue; // Infinite queueing? 
std::queue updateQueue; - int updateQueueSize; - int updateQueueLen; + // int updateQueueSize; + // int updateQueueLen; EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -106,9 +117,6 @@ class PushEngine : public ClockedObject EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - EventFunctionWrapper nextCreateEvent; - void processNextCreateEvent(); - EventFunctionWrapper nextSendEvent; void processNextSendEvent(); diff --git a/src/accl/util.cc b/src/accl/util.cc index 8debd3a937..76ed6269c2 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,6 +28,34 @@ #include "accl/util.hh" + +// Edge: (weight: 64 bits, neighbor: 64 bits) +Edge& +memoryToEdge(uint8_t *data) +{ + uint64_t weight = *((uint64_t*) data); + Addr neighbor = *((Addr*) (data + 8)); // data + 8 because weight: 8 bytes + Edge e = {weight, neighbor}; + return e; +} + +// Edge: (weight: 64 bits, neighbor: 64 bits) +uint8_t* +edgeToMemory(Edge e) +{ + int data_size = (int) ((sizeof(Edge)) / (sizeof(uint8_t))); + + uint8_t* data = new uint8_t [data_size]; + + uint64_t* weightPtr = (uint64_t*) data; + *weightPtr = e.weight; + + Addr* neighborPtr = (Addr*) (data + 8); // data + 8 because weight: 8 bytes + *neighborPtr = e.neighbor; + + return data; +} + PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { @@ -43,6 +71,7 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } + PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, @@ -53,6 +82,18 @@ PacketPtr getWritePacket(Addr addr, req->setPC(((Addr)requestorId) << 2); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + +PacketPtr +getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +{ + RequestPtr req = std::make_shared(addr, size, 0, + requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr)requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + 
pkt->allocate(); pkt->setData(data); @@ -84,4 +125,4 @@ workListToMemory(WorkListItem wl){ *wList + 3 = wl.edgeIndex; return data; -} \ No newline at end of file +} diff --git a/src/accl/util.hh b/src/accl/util.hh index 00ccb7ddd9..c309d4967a 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -39,13 +39,15 @@ struct WorkListItem struct Edge { - uint32_t weight; + uint64_t weight; Addr neighbor; } WorkListItem& memoryToWorkList(uint8_t* data); unit8_t* workListToMemory(WorkListItem wl); + Edge& memoryToEdge(uint8_t* data); +uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); \ No newline at end of file diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -148,6 +148,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; From a13dcdb4c82d5a6d75eede265f42364ddb13f01a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 14 Feb 2022 10:20:19 -0800 Subject: [PATCH 013/247] arch: Accelerator [wip] Adding Sconscript, debugging Change-Id: I0cef6e8745ca8f58a17a01d71dfb090fe1a7e606 --- src/accl/PushEngine.py | 39 ++++++++++++++++++++++ src/accl/SConscript | 36 ++++++++++++++++++++ src/accl/apply.cc | 74 +++++++++++++++++++---------------------- src/accl/apply.hh | 24 +++++++++---- src/accl/push_engine.cc | 2 +- src/accl/util.cc | 2 ++ src/accl/util.hh | 7 ++-- src/accl/wl_engine.cc | 71 +++++++++++++++++++-------------------- src/accl/wl_engine.hh | 20 +++++++---- 9 files changed, 180 insertions(+), 95 deletions(-) create mode 100644 src/accl/PushEngine.py create mode 100644 src/accl/SConscript diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py new file mode 100644 index 0000000000..37639377c1 
--- /dev/null +++ b/src/accl/PushEngine.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class PushEngine(ClockedObject): + type = 'PushEngine' + cxx_header = "accl/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + respPort = ResponsePort("Receives requests from WorkList") + reqPort = RequestPort("Sends requests to Push") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/SConscript b/src/accl/SConscript new file mode 100644 index 0000000000..da0774ca44 --- /dev/null +++ b/src/accl/SConscript @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import('*') + +SimObject('WLEngine.py') +# SimObject('Apply.py') +# SimObject('PushEngine.py') + +# Source('apply.cc') +Source('wl_engine.cc') +# Source('push_engine.cc') diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 6b474d5628..985e6217d7 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -38,11 +38,10 @@ Apply::Apply(const ApplyParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextApplyEvent([this]{processNextApplyEvent; }, name()), - nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), - queueSize(params.applyQueueSize) //add this to .py + nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()) { - applyReadQueue(queueSize); - applyWriteQueue(queueSize); + applyReadQueue(params.applyQueueSize); + applyWriteQueue(params.applyQueueSize); } Port & @@ -110,7 +109,7 @@ Apply::ApplyMemPort::recvReqRetry() } void -WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) +Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -119,7 +118,7 @@ WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyReqtPort::recvReqRetry() +Apply::ApplyReqPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -134,12 +133,13 @@ Apply::getAddrRanges() const bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; - } else - 
queue->push(pkt); - if(!nextApplyCheckEvent.scheduled()){ + } else{ + queue.push(pkt); + } + if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; @@ -147,22 +147,19 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - if(!memPort->blocked()){ + if (!memPort.blocked()){ auto pkt = queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - respPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + respPort.trySendRetry(); + queue.sendPktRetry = false; } // conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; - RequestPtr req = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - requestOffset[req] = req_offset; - memPort->sendPacket(memPkt); - } - else{ - break; + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = req_offset; + memPort.sendPacket(memPkt); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -174,11 +171,11 @@ Apply::handleMemResp(PacktPtr pkt) { auto queue = applyWriteQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; } else - queue->push(writePkt); + queue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); @@ -193,41 +190,38 @@ Apply::processNextApplyEvent(){ auto pkt = queue.front(); uint8_t* data = pkt->getPtr(); - RequestPtr req = pkt->req; - int request_offset = requestOffset[req]; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); uint32_t prop = wl.prop; uint32_t temp_prop = wl.temp_prop; if (temp_prop != prop){ - if (!memPort->blocked() && !reqPort->blocked()){ + if (!memPort.blocked() && 
!reqPort.blocked()){ //update prop with temp_prop - wl.prop = min(prop , temp_prop); + wl.prop = std::min(prop , temp_prop); //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); //Create memory write requests. PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort->sendPacket(writePkt); - applyReqPort->sendPacket(writePkt); + memPort.sendPacket(writePkt); + applyReqPort.sendPacket(writePkt); queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } } - else - break; - } - else{ + }else{ queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } } if(!queue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } -} \ No newline at end of file +} diff --git a/src/accl/apply.hh b/src/accl/apply.hh index b213d37667..f4dabd6a97 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -35,10 +35,12 @@ #include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" -#include "mem/port.hh" +#include "base/types.hh" #include "mem/packet.hh" -#include "params/MPU.hh" +#include "mem/port.hh" +#include "params/Apply.hh" #include "sim/clocked_object.hh" +#include "sim/port.hh" class Apply : public ClockedObject { @@ -46,17 +48,25 @@ class Apply : public ClockedObject struct ApplyQueue{ std::queue applyQueue; - const uint_32 queueSize; + const uint32_t queueSize; bool sendPktRetry; bool blocked(){ - return applyQueue.size() == queueSize; + return (applyQueue.size() == queueSize); } bool empty(){ - return applyQueue.empty(); + return applyQueue->empty(); } void push(PacketPtr pkt){ - 
applyQueue.push(pkt); + applyQueue->push(pkt); + } + + void pop(){ + applyQueue->pop(); + } + + void front(){ + applyQueue->front(); } ApplyQueue(uint32_t qSize): @@ -167,4 +177,4 @@ class Apply : public ClockedObject PortID idx=InvalidPortID) override; }; -#endif // __ACCL_APPLY_HH__ \ No newline at end of file +#endif // __ACCL_APPLY_HH__ diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index cd5f73eea3..c02009d25a 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -28,7 +28,7 @@ #include "accl/util.hh" #include "accl/push_engine.hh" -#include "debug/PushEngine.hh" +// #include "debug/PushEngine.hh" PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), diff --git a/src/accl/util.cc b/src/accl/util.cc index 76ed6269c2..92f6a3e351 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,6 +28,8 @@ #include "accl/util.hh" +#include "base/types.hh" +#include "mem/packet.hh" // Edge: (weight: 64 bits, neighbor: 64 bits) Edge& diff --git a/src/accl/util.hh b/src/accl/util.hh index c309d4967a..737d52e2a1 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "base/addr_range_map.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -35,7 +36,7 @@ struct WorkListItem uint32_t prop; uint32_t degree; uint32_t edgeIndex; -} +}; struct Edge { @@ -44,10 +45,10 @@ struct Edge } WorkListItem& memoryToWorkList(uint8_t* data); -unit8_t* workListToMemory(WorkListItem wl); +uint8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); \ No newline at end of file +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 7d6d707ae6..757bdd2598 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -39,11 +39,10 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextWLReadEvent([this]{processNextWLReadEvent; }, name()), - nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), - queueSize(params.wlQueueSize) //add this to .py + nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) { - updateQueue(queueSize); - responseQueue(queueSize); + updateQueue(params.wlQueueSize); + responseQueue(params.wlQueueSize); } Port & @@ -138,11 +137,11 @@ WLEngine::getAddrRanges() const bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; - if (queue->blocked()){ - queue->sendPktRetry = true; + if (queue.blocked()){ + queue.sendPktRetry = true; return false; } else - queue->push(pkt); + queue.push(pkt); if(!nextWLReadEvent.scheduled()){ schedule(nextWLReadEvent, nextCycle()); @@ -152,19 +151,19 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ void WLEngine::processNextWLReadEvent(){ auto queue = updateQueue; - memPort = WLMemPort - while(!queue.empty()){ //create a map instead of front + auto memPort = WLMemPort; + while (!queue.empty()){ 
//create a map instead of front auto pkt = queue.front() /// conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; - RequestPtr req = + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - requestOffset[req] = req_offset; - if (!memPort->blocked()){ + requestOffset[request] = req_offset; + if (!memPort.blocked()){ queue.pop() - memPort->sendPacket(memPkt); + memPort.sendPacket(memPkt); break; } } @@ -177,11 +176,11 @@ bool WLEngine::handleMemResp(PacktPtr pkt) { auto queue = responseQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; } else - queue->push(writePkt); + queue.push(writePkt); if(!nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); @@ -199,49 +198,47 @@ WLEngine::processNextWLReduceEvent(){ auto value = update->getPtr(); auto pkt = queue.front(); uint8_t* data = pkt->getPtr(); - RequestPtr req = pkt->req; - int request_offset = requestOffset[req]; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset) uint32_t temp_prop = wl.temp_prop; if (temp_prop != *value){ //update prop with temp_prop - temp_prop = min(value , temp_prop); - if (!memPort->blocked() && !applyPort->blocked()){ + temp_prop = std::min(value , temp_prop); + if (!memPort.blocked() && !applyPort.blocked()){ wl.temp_prop = temp_prop; - unit8_t* wlItem = workListToMemory(wl); + uint8_t* wlItem = workListToMemory(wl); memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort->sendPacket(writePkt); - applyPort->sendPacket(writePkt); + memPort.sendPacket(writePkt); + applyPort.sendPacket(writePkt); queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (!queue.blocked() && 
queue.sendPktRetry){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } updateQ.pop(); - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; + if (!updateQ.blocked() & updateQ.sendPktRetry){ + respPort.trySendRetry(); + updateQ.sendPktRetry = false; } } - else - break; } else{ queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (!queue.blocked() && queue.sendPktRetry){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } updateQ.pop() - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; + if (!updateQ.blocked() & updateQ.sendPktRetry){ + respPort.trySendRetry(); + updateQ.sendPktRetry = false; } } - if(!queue && !nextWLReduceEvent.scheduled()){ + if (!queue.empty() && !nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 7132283463..0393cd4cb5 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -37,9 +37,9 @@ #include "base/statistics.hh" #include "mem/port.hh" #include "mem/packet.hh" -#include "params/MPU.hh" +#include "params/WLEngine.hh" #include "sim/clocked_object.hh" - +#include "sim/port.hh" class WLEngine : public ClockedObject { @@ -47,20 +47,26 @@ class WLEngine : public ClockedObject struct WLQueue{ std::queue wlQueue; - const uint_32 queueSize; + const uint32_t queueSize; bool sendPktRetry; bool blocked(){ - return wlQueue.size() == queueSize; + return (wlQueue.size() == queueSize); } bool empty(){ - return wlQueue.empty(); + return wlQueue->empty(); } void push(PacketPtr pkt){ - wlQueue.push(pkt); + wlQueue->push(pkt); + } + void pop(){ + wlQueue->pop(); + } + void front(){ + wlQueue.front()); } - WLReqPort(uint32_t qSize): + WLQueue(uint32_t qSize): queueSize(qSize){} }; From d65b96c0ab6fdd6763a6d940b8bcc8759153930e Mon Sep 17 00:00:00 2001 From: 
Mahyar Samani Date: Mon, 14 Feb 2022 10:02:36 -0800 Subject: [PATCH 014/247] Addin simobject file and startup for PushEngine. --- src/accl/PushEngine.py | 11 ++++++----- src/accl/push_engine.cc | 37 ++++++++++++++++++++++++++++++++++++- src/accl/push_engine.hh | 3 +++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py index 37639377c1..3215fdaee2 100644 --- a/src/accl/PushEngine.py +++ b/src/accl/PushEngine.py @@ -26,14 +26,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class PushEngine(ClockedObject): - type = 'PushEngine' + type = 'WLEngine' cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' - respPort = ResponsePort("Receives requests from WorkList") - reqPort = RequestPort("Sends requests to Push") - memPort = RequestPort("Memory side port, sends requests") + system = Param.System(Parent.any, "The system object this push engine is a part of") + respPort = ResponsePort("Port to Receive updates from outside") + reqPort = RequestPort("Port to send updates to the outside") + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index c02009d25a..f1f8f7698b 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -60,6 +60,40 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::startup() +{ + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr 
pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, requestorId); + memPort.sendFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, requestorId); + memPort.sendFunctional(pkt); + } + +} + bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); @@ -104,7 +138,8 @@ void PushEngine::processNextReceiveEvent() std::vector num_edge_queue; for (uint32_t index = 0; index < degree; index++) { - Addr edge_addr = (edge_index + index) * sizeof(Edge); + // FIXME: For now the base edge address is 1048576 + Addr edge_addr = 1048576 + (edge_index + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index a746dcc265..077c61aa2b 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -39,6 +39,7 @@ #include "sim/clocked_object.hh" #include "sim/system.hh" +//FIXME: Add gem5 namespace here class PushEngine : public ClockedObject { private: @@ -89,6 +90,8 @@ class PushEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); } + virtual void startup() override; + System* const system; const RequestorID requestorId; From fb64f7d3e1c82b7a71b70a14215f8663c8908d65 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 12:26:30 -0800 Subject: [PATCH 015/247] Bug fixes. 
--- src/accl/SConscript | 8 ++--- src/accl/util.cc | 82 +++++++++++++++++++++++++-------------------- src/accl/util.hh | 7 ++-- 3 files changed, 55 insertions(+), 42 deletions(-) diff --git a/src/accl/SConscript b/src/accl/SConscript index da0774ca44..4b78ff9e80 100644 --- a/src/accl/SConscript +++ b/src/accl/SConscript @@ -28,9 +28,9 @@ Import('*') SimObject('WLEngine.py') -# SimObject('Apply.py') -# SimObject('PushEngine.py') +SimObject('Apply.py') +SimObject('PushEngine.py') -# Source('apply.cc') +Source('apply.cc') Source('wl_engine.cc') -# Source('push_engine.cc') +Source('push_engine.cc') diff --git a/src/accl/util.cc b/src/accl/util.cc index 92f6a3e351..b81ba4db7d 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,8 +28,39 @@ #include "accl/util.hh" -#include "base/types.hh" -#include "mem/packet.hh" +WorkListItem& +memoryToWorkList(uint8_t* data){ + WorkListItem wl; + uint32_t temp_prop = *((uint32_t*) data)); + + uint32_t prop = *((uint32_t*) (data + 4)); + + uint32_t degree = *((uint32_t*) (data + 8)); + + uint32_t addr = *((uint32_t*) (data + 12)); + + retrun wl = {temp_prop, prop, degree, addr}; +} + +uint8_t* +workListToMemory(WorkListItem wl){ + int data_size = sizeof(WorkListItem) / sizeof(uint8_t); + uint8_t* data = new uint8_t [data_size]; + + uint32_t* tempPtr = (uint32_t*) data; + *tempPtr = wl.temp_prop; + + uint32_t* propPtr = (uint32_t*) (data + 4); + *propPtr = wl.prop; + + uint32_t* degreePtr = (uint32_t*) (data + 8); + *degreePtr = wl.degree; + + uint32_t* edgePtr = (uint32_t*) (data + 12); + *edgePtr = wl.edgeIndex; + + return data; +} // Edge: (weight: 64 bits, neighbor: 64 bits) Edge& @@ -58,7 +89,7 @@ edgeToMemory(Edge e) return data; } -PacketPtr +PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -73,19 +104,24 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } - -PacketPtr getWritePacket(Addr 
addr, - unsigned int size, - uint8_t* data, - RequestorID requestorId) +PacketPtr& +getWritePacket(Addr addr, unsigned int size, + uint8_t* data, RequestorID requestorId) { - equestPtr req = std::make_shared(addr, size, 0, + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits req->setPC(((Addr)requestorId) << 2); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} -PacketPtr +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) { RequestPtr req = std::make_shared(addr, size, 0, @@ -102,29 +138,3 @@ getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) return pkt; } -WorkListItem& -memoryToWorkList(uint8_t* data){ - WorkListItem wl; - uint32_t temp_prop = *((uint32_t*) data)); - - uint32_t prop = *((uint32_t*) (data + 4)); - - uint32_t degree = *((uint32_t*) (data + 8)); - - uint32_t addr = *((uint32_t*) (data + 12)); - - retrun wl = {temp_prop, prop, degree, addr}; -} - -unit8_t* -workListToMemory(WorkListItem wl){ - int data_size = sizeof(WorkListItem)/sizeof(uint_8) - uint_8* data = new uint8_t [data_size]; - uint_32* wList = (uint_32*)data; - *wList = wl.prop; - *wList + 1 = wl.temp_prop; - *wList + 2 = wl.degree; - *wList + 3 = wl.edgeIndex; - - return data; -} diff --git a/src/accl/util.hh b/src/accl/util.hh index 737d52e2a1..da5a0736c9 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,7 +26,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "base/addr_range_map.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -50,5 +49,9 @@ uint8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); -PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); +PacketPtr& getReadPacket(Addr addr, unsigned int size, + RequestorID requestorId); +PacketPtr& +getWritePacket(Addr addr, unsigned int size, + uint8_t* data, RequestorID requestorId); PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); From 9eeb01889c5813d1f60ddfacda5e4c4538460860 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 13:05:49 -0800 Subject: [PATCH 016/247] More bug fixes. --- src/accl/SConscript | 5 +++-- src/accl/apply.cc | 5 +++++ src/accl/apply.hh | 5 +++++ src/accl/push_engine.cc | 6 +++++- src/accl/push_engine.hh | 5 ++++- src/accl/util.hh | 3 +-- src/accl/wl_engine.cc | 4 ++++ src/accl/wl_engine.hh | 11 ++++++++--- 8 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/accl/SConscript b/src/accl/SConscript index 4b78ff9e80..18ac71eb7d 100644 --- a/src/accl/SConscript +++ b/src/accl/SConscript @@ -27,10 +27,11 @@ Import('*') -SimObject('WLEngine.py') SimObject('Apply.py') SimObject('PushEngine.py') +SimObject('WLEngine.py') Source('apply.cc') -Source('wl_engine.cc') Source('push_engine.cc') +Source('wl_engine.cc') +Source('util.cc') diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 985e6217d7..678f240bf6 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -30,6 +30,9 @@ #include +namespace gem5 +{ + Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), system(params.system), @@ -225,3 +228,5 @@ Apply::processNextApplyEvent(){ schedule(nextApplyEvent, nextCycle()); } } + +} diff --git a/src/accl/apply.hh b/src/accl/apply.hh index f4dabd6a97..42cb310136 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -42,6 +42,9 @@ #include "sim/clocked_object.hh" #include "sim/port.hh" 
+namespace gem5 +{ + class Apply : public ClockedObject { private: @@ -177,4 +180,6 @@ class Apply : public ClockedObject PortID idx=InvalidPortID) override; }; +} + #endif // __ACCL_APPLY_HH__ diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index f1f8f7698b..57fa560ff7 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -28,7 +28,9 @@ #include "accl/util.hh" #include "accl/push_engine.hh" -// #include "debug/PushEngine.hh" + +namespace gem5 +{ PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), @@ -243,3 +245,5 @@ void PushEngine::processNextSendEvent() schedule(nextSendEvent, nextCycle()); } } + +} diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 077c61aa2b..cc129076a5 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -39,7 +39,9 @@ #include "sim/clocked_object.hh" #include "sim/system.hh" -//FIXME: Add gem5 namespace here +namespace gem5 +{ + class PushEngine : public ClockedObject { private: @@ -134,4 +136,5 @@ class PushEngine : public ClockedObject }; +} #endif // __ACCL_PUSH_ENGINE_HH__ diff --git a/src/accl/util.hh b/src/accl/util.hh index da5a0736c9..76d67ce6df 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -51,7 +51,6 @@ uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& -getWritePacket(Addr addr, unsigned int size, +PacketPtr& getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 757bdd2598..00371e56cc 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -30,6 +30,8 @@ #include +namespace gem5 +{ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), @@ -242,3 +244,5 @@ WLEngine::processNextWLReduceEvent(){ schedule(nextWLReduceEvent, nextCycle()); } } + +} diff 
--git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 0393cd4cb5..8c69bba7f7 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -41,6 +41,9 @@ #include "sim/clocked_object.hh" #include "sim/port.hh" +namespace gem5 +{ + class WLEngine : public ClockedObject { private: @@ -117,7 +120,7 @@ class WLEngine : public ClockedObject public: WLMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - void sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); void trySendRetry(); bool blocked(){ return _blocked; @@ -132,7 +135,7 @@ class WLEngine : public ClockedObject bool sendPacket(); //one queue for write and one for read a priotizes write over read void readWLBuffer(); - bool handleMemResp(PacktPtr resp); + bool handleMemResp(PacketPtr resp); //Events @@ -174,4 +177,6 @@ class WLEngine : public ClockedObject PortID idx=InvalidPortID) override; }; -#endif // __ACCL_WLE_HH__ \ No newline at end of file +} + +#endif // __ACCL_WLE_HH__ From 6efe411a7a16cca5b80ce4fdecba591c1f9de67a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 14:18:14 -0800 Subject: [PATCH 017/247] Even more bug fixes. --- src/accl/push_engine.cc | 28 +++++++++++++++++++++++----- src/accl/push_engine.hh | 35 +++++++++++++++++++++-------------- src/accl/util.cc | 24 ++++++++++++++---------- src/accl/util.hh | 18 ++++++++++++------ 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 57fa560ff7..56a57e76ac 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -65,6 +65,8 @@ PushEngine::getPort(const std::string &if_name, PortID idx) void PushEngine::startup() { + //FIXME: This is the current version of our initializer. + // This should be updated in the future. 
WorkListItem vertices [5] = { {0, 0, 3, 0}, // Addr: 0 {0, 0, 1, 3}, // Addr: 16 @@ -109,6 +111,7 @@ PushEngine::PushRespPort::getAddrRanges() bool PushEngine::handleUpdate(PacketPtr pkt) { + //FIXME: There should be a check if the queues are full. // if (vertexQueueLen < vertexQueueSize) { // vertexQueue.push(pkt) // vertexQueueLen++; @@ -192,20 +195,19 @@ bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -void PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +void +PushEngine::PushMemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { blockedPacket = pkt; - DPRINTF(MemScheduler, "Setting blocked to true on port %s\n", - this->name()); _blocked = true; } } -void PushEngine::handleMemResp(PacketPtr pkt) +bool PushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -230,8 +232,12 @@ void PushEngine::handleMemResp(PacketPtr pkt) if (!nextSendEvent.scheduled() && !updateQueue.empty()) { schedule(nextSendEvent, nextCycle()); } -} + //TODO: Should we always return true? It's the response from the memory + // so maybe yes. We assume the receiving bandwidth of the PushEngine is + // higher than its demand bandwidth + return true; +} void PushEngine::processNextSendEvent() { @@ -246,4 +252,16 @@ void PushEngine::processNextSendEvent() } } +void +PushEngine::PushReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index cc129076a5..7b5f483431 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -54,10 +54,10 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: - //TODO: Implement this; - PushRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + PushRespPort(const std::string& name, PushEngine* owner): + ResponsePort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); } @@ -65,27 +65,32 @@ class PushEngine : public ClockedObject class PushReqPort : public RequestPort { private: + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - // TODO: Implement this; - PushReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + PushReqPort(const std::string& name, PushEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); } class PushMemPort : public RequestPort { private: + PushEngine* owner bool _blocked; PacketPtr blockedPacket; public: - // TODO: Implement this; - PushMemPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + PushMemPort(const std::string& name, PushEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacktPtr pkt); bool blocked() { return _blocked; } @@ -106,9 +111,9 @@ class PushEngine : public ClockedObject // int vertexQueueSize; // int vertexQueueLen; - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map 
reqValueMap; std::queue memReqQueue; // Infinite queueing? @@ -127,6 +132,8 @@ class PushEngine : public ClockedObject bool handleUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr pkt); + public: PushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/util.cc b/src/accl/util.cc index b81ba4db7d..40a1fc761b 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,18 +28,20 @@ #include "accl/util.hh" -WorkListItem& +namespace gem5 +{ + +WorkListItem memoryToWorkList(uint8_t* data){ WorkListItem wl; - uint32_t temp_prop = *((uint32_t*) data)); + uint32_t temp_prop = *((uint32_t*) data); uint32_t prop = *((uint32_t*) (data + 4)); - uint32_t degree = *((uint32_t*) (data + 8)); - uint32_t addr = *((uint32_t*) (data + 12)); - retrun wl = {temp_prop, prop, degree, addr}; + wl = {temp_prop, prop, degree, addr}; + return wl; } uint8_t* @@ -63,7 +65,7 @@ workListToMemory(WorkListItem wl){ } // Edge: (weight: 64 bits, neighbor: 64 bits) -Edge& +Edge memoryToEdge(uint8_t *data) { uint64_t weight = *((uint64_t*) data); @@ -89,7 +91,7 @@ edgeToMemory(Edge e) return data; } -PacketPtr& +PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -104,7 +106,7 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } -PacketPtr& +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId) { @@ -121,8 +123,9 @@ getWritePacket(Addr addr, unsigned int size, return pkt; } -PacketPtr& -getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +PacketPtr +getUpdatePacket(Addr addr, unsigned int size, + uint8_t *data, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -138,3 +141,4 @@ getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) return pkt; } +} diff --git a/src/accl/util.hh b/src/accl/util.hh index 76d67ce6df..91692488a4 100644 --- a/src/accl/util.hh +++ 
b/src/accl/util.hh @@ -28,6 +28,10 @@ #include "base/types.hh" #include "mem/packet.hh" +#include "mem/request.hh" + +namespace gem5 +{ struct WorkListItem { @@ -41,16 +45,18 @@ struct Edge { uint64_t weight; Addr neighbor; -} +}; -WorkListItem& memoryToWorkList(uint8_t* data); +WorkListItem memoryToWorkList(uint8_t* data); uint8_t* workListToMemory(WorkListItem wl); -Edge& memoryToEdge(uint8_t* data); +Edge memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); -PacketPtr& getReadPacket(Addr addr, unsigned int size, +PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& getWritePacket(Addr addr, unsigned int size, +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); -PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); +PacketPtr getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + +} From fcdcceb33d9d2dc054f8ad021c0e39c8e4bff21e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 15:46:21 -0800 Subject: [PATCH 018/247] Bug fixes, bug fixes everywhere. 
--- src/accl/apply.cc | 12 ++++---- src/accl/apply.hh | 61 ++++++++++++++++++++--------------------- src/accl/push_engine.cc | 8 +++++- src/accl/push_engine.hh | 17 ++++++------ src/accl/wl_engine.hh | 17 +++++------- 5 files changed, 60 insertions(+), 55 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 678f240bf6..c44738d3fa 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -62,14 +62,14 @@ Apply::getPort(const std::string &if_name, PortID idx) } AddrRangeList -Apply::ApplyRespPort::getAddrRanges() const +Apply::ApplyRespPort::getAddrRanges() { return owner->getAddrRanges(); } bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) { - if (!this->handleWL(pkt)){ + if (!owner->handleWL(pkt)){ return false; } return true; @@ -82,15 +82,17 @@ Apply::ApplyRespPort::trySendRetry() } -virtual bool +bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { - return this->handleMemResp(pkt); + return owner->handleMemResp(pkt); } void -WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) +Apply::ApplyMemPort::sendPacket(PacketPtr pkt) { + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 42cb310136..788550646a 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -33,14 +33,13 @@ #include #include "accl/util.hh" -#include "base/addr_range_map.hh" -#include "base/statistics.hh" -#include "base/types.hh" +#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/Apply.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" +#include "sim/system.hh" namespace gem5 { @@ -58,18 +57,18 @@ class Apply : public ClockedObject return (applyQueue.size() == queueSize); } bool empty(){ - return applyQueue->empty(); + return applyQueue.empty(); } void push(PacketPtr pkt){ - applyQueue->push(pkt); + applyQueue.push(pkt); } void pop(){ - applyQueue->pop(); + applyQueue.pop(); } 
void front(){ - applyQueue->front(); + applyQueue.front(); } ApplyQueue(uint32_t qSize): @@ -80,16 +79,17 @@ class Apply : public ClockedObject { private: Apply *owner; + bool _blocked; PacketPtr blockedPacket; public: - ApplyRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyRespPort(const std::string& name, Apply* owner): + ResponsePort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} - virtual AddrRangeList getAddrRanges(); void trySendRetry(); - - protected: + virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); }; @@ -101,12 +101,13 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: - ApplyReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyReqPort(const std::string& name, Apply* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); - bool blocked(){ - return _blocked; - } + bool blocked() { return _blocked; } protected: void recvReqRetry() override; @@ -121,13 +122,14 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: - ApplyReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyMemPort(const std::string& name, Apply* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); void trySendRetry(); - bool blocked(){ - return _blocked; - } + bool blocked(){ return _blocked;} protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -138,28 +140,24 @@ class Apply : public ClockedObject bool sendPacket(); //one queue for write and one for read a priotizes write over read void readApplyBuffer(); - bool handleMemResp(PacktPtr resp); + bool handleMemResp(PacketPtr resp); void writePushBuffer(); //Events void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyCheckEvent; /* Syncronously checked If there are any 
active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ void processNextApplyEvent(); + EventFunctionWrapper nextApplyEvent; /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - void processNextApplyEvent(); - EventFunctionWrapper nextApplyEvent; - - void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyCheckEvent; - System* const system; const RequestorID requestorId; @@ -170,13 +168,14 @@ class Apply : public ClockedObject ApplyMemPort memPort; ApplyRespPort respPort; - ApplyRequestPort reqPort; + ApplyReqPort reqPort; std::unordered_map requestOffset; public: Apply(const ApplyParams &apply); - Port &getPort(const std::string &if_name, + + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 56a57e76ac..48f1115042 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -106,7 +106,7 @@ bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) AddrRangeList PushEngine::PushRespPort::getAddrRanges() { - owner->memPort->getAddrRanges(); + owner->getAddrRanges(); } bool PushEngine::handleUpdate(PacketPtr pkt) @@ -264,4 +264,10 @@ PushEngine::PushReqPort::sendPacket(PacketPtr pkt) } } +AddrRangeList +PushEngine::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 7b5f483431..d478d14df0 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -31,8 +31,7 @@ #include -#include "base/addr_range_map.hh" -#include "base/statistics.hh" +#include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/PushEngine.hh" @@ -60,7 +59,7 @@ class PushEngine : public ClockedObject {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); - } + }; class PushReqPort : public 
RequestPort { @@ -77,12 +76,12 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); - } + }; class PushMemPort : public RequestPort { private: - PushEngine* owner + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; @@ -92,10 +91,10 @@ class PushEngine : public ClockedObject _blocked(false), blockedPacket(nullptr) {} - void sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); - } + }; virtual void startup() override; @@ -134,11 +133,13 @@ class PushEngine : public ClockedObject bool handleMemResp(PacketPtr pkt); + AddrRangeList getAddrRanges(); + public: PushEngine(const PushEngineParams ¶ms); - Port &getPort(const std::string &if_name, + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 8c69bba7f7..6f875adfed 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -33,13 +33,13 @@ #include #include "accl/util.hh" -#include "base/addr_range_map.hh" -#include "base/statistics.hh" +#include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/WLEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" +#include "sim/system.hh" namespace gem5 { @@ -140,40 +140,37 @@ class WLEngine : public ClockedObject //Events void processNextWLReadEvent(); + EventFunctionWrapper nextWLReadEvent; /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ void processNextWLReduceEvent(); + EventFunctionWrapper nextWLReduceEvent; /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - void processNextWLReadEvent(); - EventFunctionWrapper nextWLReadEvent; - - void 
processNextWLReduceEvent(); - EventFunctionWrapper nextWLReduceEvent; System* const system; const RequestorID requestorId; + std::unordered_map requestOffset; AddrRangeList getAddrRanges() const; WLQueue updateQueue; WLQueue responseQueue; - WLMemPort memPort; WLMemPort memPort; WLRespPort respPort; - WLRequestPort reqPort; + WLReqPort reqPort; public: WLEngine(const WLEngineParams ¶ms); - Port &getPort(const std::string &if_name, + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; From 750510f593e59e776bbfb2906a8b8e082669aa36 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 14 Feb 2022 21:33:53 -0800 Subject: [PATCH 019/247] arch: Debugging worklist engine [wip] Adding some missing virtual functions. Change-Id: I26f6c7d789f4b295bac3bc9b2a80f2cadb45b96f --- src/accl/wl_engine.cc | 26 +++++++++++++++++++++++++- src/accl/wl_engine.hh | 4 ++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 00371e56cc..7515e10167 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -81,6 +81,24 @@ WLEngine::WLRespPort::trySendRetry() sendRetryReq(); } +virtual void +WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +virtual Tick +WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +virtual void +WLEngine::WLRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + void WLEngine::WLMemPort::sendPacket(PacketPtr pkt) { @@ -137,6 +155,12 @@ WLEngine::getAddrRanges() const return memPort.getAddrRanges(); } +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + memPort.recvFunctional(pkt); +} + bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ @@ -164,7 +188,7 @@ void WLEngine::processNextWLReadEvent(){ PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); requestOffset[request] = req_offset; if (!memPort.blocked()){ - 
queue.pop() + queue.pop(); memPort.sendPacket(memPkt); break; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 6f875adfed..d2b96db203 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -88,6 +88,9 @@ class WLEngine : public ClockedObject protected: virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class WLReqPort : public RequestPort //To Apply Engine @@ -159,6 +162,7 @@ class WLEngine : public ClockedObject std::unordered_map requestOffset; AddrRangeList getAddrRanges() const; + void recvFunctional(PacketPtr pkt); WLQueue updateQueue; WLQueue responseQueue; From 79429d177df5baef0d3cd4fc33a4db249d66db37 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:38:05 -0800 Subject: [PATCH 020/247] Bug fix. --- src/accl/Apply.py | 1 + src/accl/apply.cc | 6 ++--- src/accl/push_engine.cc | 50 ++++++++++++++++++++++++++++++++--------- src/accl/push_engine.hh | 3 +++ 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index 01c627d4c8..58639e880a 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -34,6 +34,7 @@ class Apply(ClockedObject): cxx_header = "accl/apply.hh" cxx_class = 'gem5::Apply' + system = Param.System(Parent.any, "The system object this apply engine is a part of") respPort = ResponsePort("Receives requests from WorkList") reqPort = RequestPort("Sends requests to Push") memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index c44738d3fa..70bc8031c9 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -40,8 +40,8 @@ Apply::Apply(const ApplyParams ¶ms): reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - nextApplyEvent([this]{processNextApplyEvent; }, name()), - nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, 
name()) + nextApplyEvent([this]{ processNextApplyEvent(); }, name()), + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) { applyReadQueue(params.applyQueueSize); applyWriteQueue(params.applyQueueSize); @@ -172,7 +172,7 @@ void Apply::processNextApplyCheckEvent(){ } bool -Apply::handleMemResp(PacktPtr pkt) +Apply::handleMemResp(PacketPtr pkt) { auto queue = applyWriteQueue; diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 48f1115042..6ebe34ebd3 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -106,7 +106,7 @@ bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) AddrRangeList PushEngine::PushRespPort::getAddrRanges() { - owner->getAddrRanges(); + return owner->getAddrRanges(); } bool PushEngine::handleUpdate(PacketPtr pkt) @@ -121,7 +121,7 @@ bool PushEngine::handleUpdate(PacketPtr pkt) // return true; // } // return false; - vertexQueue.push(pkt) + vertexQueue.push(pkt); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); } @@ -130,8 +130,8 @@ bool PushEngine::handleUpdate(PacketPtr pkt) void PushEngine::processNextReceiveEvent() { - PacketPtr updatePkt = vertexQueue.pop(); - uint8_t *data = updatePkt->getData(); + PacketPtr updatePkt = vertexQueue.front(); + uint8_t *data = updatePkt->getPtr(); // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) uint32_t edge_index = *((uint32_t *)data); @@ -152,19 +152,19 @@ void PushEngine::processNextReceiveEvent() num_edge_queue.back()++; } else { - addr_queue.push(req_addr); - offset_queue.push(req_offset); - num_edge_queue.push(1); + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); } } else { - addr_queue.push(req_addr); - offset_queue.push(req_offset); - num_edge_queue.push(1); + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); } } - for (int index = 0; index < addr_queue.size(); inedx++) { + for (int index = 0; index 
< addr_queue.size(); index++) { PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); memReqQueue.push(pkt); reqOffsetMap[pkt->req] = offset_queue[index]; @@ -172,6 +172,8 @@ void PushEngine::processNextReceiveEvent() reqValueMap[pkt->req] = value; } + vertexQueue.pop(); + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { schedule(nextReadEvent, nextCycle()); } @@ -264,10 +266,36 @@ PushEngine::PushReqPort::sendPacket(PacketPtr pkt) } } +void +PushEngine::PushReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + AddrRangeList PushEngine::getAddrRanges() { return memPort.getAddrRanges(); } +void +PushEngine::PushMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index d478d14df0..0acedd0da8 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -59,6 +59,7 @@ class PushEngine : public ClockedObject {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + }; class PushReqPort : public RequestPort @@ -76,6 +77,7 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); }; class PushMemPort : public RequestPort @@ -94,6 +96,7 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); }; virtual void startup() override; From 228fcf05f87be11a23ee5cfb8dec41d5b8dbcedd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:46:27 -0800 Subject: [PATCH 
021/247] Bug fix. --- src/accl/Apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index 58639e880a..d6a4bbe5a9 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class Apply(ClockedObject): From 709a21552623e2f112730512a1652d0436ccce03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:47:36 -0800 Subject: [PATCH 022/247] Fixing a bug-fix. --- src/accl/apply.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 788550646a..e1b6d33359 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -88,6 +88,7 @@ class Apply : public ClockedObject _blocked(false), blockedPacket(nullptr) {} + protected: void trySendRetry(); virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); From c1dd68a3e06a498b89cbb043f4779865ecad91b3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 15 Feb 2022 00:13:21 -0800 Subject: [PATCH 023/247] fixing some bugs --- src/accl/Apply.py | 1 + src/accl/WLEngine.py | 4 +++- src/accl/apply.cc | 31 ++++++++++++++++++++---- src/accl/apply.hh | 23 ++++++++++-------- src/accl/wl_engine.cc | 48 +++++++++++++++++++------------------ src/accl/wl_engine.hh | 55 ++++++++++++++++++++++++------------------- 6 files changed, 99 insertions(+), 63 deletions(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index d6a4bbe5a9..8720287cc8 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -38,3 +38,4 @@ class Apply(ClockedObject): respPort = ResponsePort("Receives requests from WorkList") reqPort = RequestPort("Sends requests to Push") memPort = RequestPort("Memory side port, sends requests") + applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git 
a/src/accl/WLEngine.py b/src/accl/WLEngine.py index fe6b25b6ba..562fd04423 100644 --- a/src/accl/WLEngine.py +++ b/src/accl/WLEngine.py @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class WLEngine(ClockedObject): @@ -34,6 +34,8 @@ class WLEngine(ClockedObject): cxx_header = "accl/wl_engine.hh" cxx_class = 'gem5::WLEngine' + system = Param.System(Parent.any, "The system object this push WorkList is a part of") respPort = ResponsePort("Receives updates") reqPort = RequestPort("Sends requests to Apply") memPort = RequestPort("Memory side port, sends requests") + wlQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 70bc8031c9..410eff5268 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -81,6 +81,23 @@ Apply::ApplyRespPort::trySendRetry() sendRetryReq(); } +void +Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) +{ + panic("Not implemented"); +} + +Tick +Apply::ApplyRespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +Apply::ApplyRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) @@ -139,7 +156,7 @@ Apply::getAddrRanges() const bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; } else{ queue.push(pkt); @@ -177,7 +194,7 @@ Apply::handleMemResp(PacketPtr pkt) auto queue = applyWriteQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; } else queue.push(pkt); @@ -192,7 +209,7 @@ Apply::handleMemResp(PacketPtr pkt) void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - auto pkt = queue.front(); + PacketPtr pkt = queue.front(); uint8_t* data = 
pkt->getPtr(); RequestPtr request = pkt->req; @@ -204,7 +221,11 @@ Apply::processNextApplyEvent(){ if (temp_prop != prop){ if (!memPort.blocked() && !reqPort.blocked()){ //update prop with temp_prop - wl.prop = std::min(prop , temp_prop); + if(prop < temp_prop){ + wl.prop = prop; + }else{ + wl.prop = temp_prop; + } //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); @@ -212,7 +233,7 @@ Apply::processNextApplyEvent(){ PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); memPort.sendPacket(writePkt); - applyReqPort.sendPacket(writePkt); + reqPort.sendPacket(writePkt); queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ memPort.trySendRetry(); diff --git a/src/accl/apply.hh b/src/accl/apply.hh index e1b6d33359..f08c1fef85 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -63,12 +63,12 @@ class Apply : public ClockedObject applyQueue.push(pkt); } - void pop(){ - applyQueue.pop(); + PacketPtr pop(){ + return applyQueue->pop(); } - void front(){ - applyQueue.front(); + PacketPtr front(){ + return applyQueue.front(); } ApplyQueue(uint32_t qSize): @@ -83,15 +83,18 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: + void trySendRetry(); + virtual AddrRangeList getAddrRanges(); ApplyRespPort(const std::string& name, Apply* owner): ResponsePort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} protected: - void trySendRetry(); - virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class ApplyReqPort : public RequestPort @@ -137,6 +140,10 @@ class Apply : public ClockedObject void recvReqRetry() override; }; + ApplyMemPort memPort; + ApplyRespPort respPort; + ApplyReqPort reqPort; + bool handleWL(PacketPtr pkt); bool sendPacket(); //one queue for write and one 
for read a priotizes write over read @@ -167,10 +174,6 @@ class Apply : public ClockedObject ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; - ApplyMemPort memPort; - ApplyRespPort respPort; - ApplyReqPort reqPort; - std::unordered_map requestOffset; public: diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 7515e10167..9b16a15575 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -36,6 +36,7 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), system(params.system), + queueSize(params.wlQueueSize), requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), @@ -43,8 +44,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): nextWLReadEvent([this]{processNextWLReadEvent; }, name()), nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) { - updateQueue(params.wlQueueSize); - responseQueue(params.wlQueueSize); + updateQueue.resize(queueSize); + responseQueue.resize(queueSize); } Port & @@ -69,7 +70,7 @@ WLEngine::WLRespPort::getAddrRanges() const bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) { - if (!this->handleWLUpdate(pkt)){ + if (!owner->handleWLUpdate(pkt)){ return false; } return true; @@ -81,19 +82,19 @@ WLEngine::WLRespPort::trySendRetry() sendRetryReq(); } -virtual void +void WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) { owner->recvFunctional(pkt); } -virtual Tick +Tick WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) { panic("recvAtomic unimpl."); } -virtual void +void WLEngine::WLRespPort::recvRespRetry() { panic("recvRespRetry from response port is called."); @@ -118,10 +119,10 @@ WLEngine::WLMemPort::recvReqRetry() blockedPacket = nullptr; } -virtual bool +bool WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { - return this->handleMemResp(pkt); + return owner->handleMemResp(pkt); } void @@ -177,15 +178,14 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ void WLEngine::processNextWLReadEvent(){ auto queue = 
updateQueue; - auto memPort = WLMemPort; while (!queue.empty()){ //create a map instead of front - auto pkt = queue.front() + PacketPtr pkt = queue.front(); /// conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; if (!memPort.blocked()){ queue.pop(); @@ -199,15 +199,15 @@ void WLEngine::processNextWLReadEvent(){ } bool -WLEngine::handleMemResp(PacktPtr pkt) +WLEngine::handleMemResp(PacketPtr pkt) { auto queue = responseQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; - } else - queue.push(writePkt); - + } else{ + queue.push(pkt); + } if(!nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } @@ -219,18 +219,20 @@ void WLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; - applyPort = reqPort; - auto update = updateQ.front(); - auto value = update->getPtr(); - auto pkt = queue.front(); + auto applyPort = reqPort; + PacketPtr update = updateQ.front(); + uint8_t* value = update->getPtr(); + PacketPtr pkt = queue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset) + WorkListItem wl = memoryToWorkList(data + request_offset); uint32_t temp_prop = wl.temp_prop; if (temp_prop != *value){ //update prop with temp_prop - temp_prop = std::min(value , temp_prop); + if(*value < temp_prop){ + temp_prop = *value; + } if (!memPort.blocked() && !applyPort.blocked()){ wl.temp_prop = temp_prop; uint8_t* wlItem = workListToMemory(wl); @@ -257,7 +259,7 @@ WLEngine::processNextWLReduceEvent(){ memPort.trySendRetry(); queue.sendPktRetry = false; } - updateQ.pop() + updateQ.pop(); if (!updateQ.blocked() & 
updateQ.sendPktRetry){ respPort.trySendRetry(); updateQ.sendPktRetry = false; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index d2b96db203..8d02c16981 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -50,27 +50,32 @@ class WLEngine : public ClockedObject struct WLQueue{ std::queue wlQueue; - const uint32_t queueSize; + uint32_t queueSize; bool sendPktRetry; + void resize(uint32_t size){ + queueSize = size; + } + bool blocked(){ return (wlQueue.size() == queueSize); } bool empty(){ - return wlQueue->empty(); + return wlQueue.empty(); } void push(PacketPtr pkt){ - wlQueue->push(pkt); + wlQueue.push(pkt); } void pop(){ - wlQueue->pop(); + wlQueue.pop(); } - void front(){ - wlQueue.front()); + PacketPtr front(){ + return wlQueue.front(); } WLQueue(uint32_t qSize): - queueSize(qSize){} + queueSize(qSize), + sendPktRetry(false){} }; class WLRespPort : public ResponsePort //From Push engine @@ -83,7 +88,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const override; void trySendRetry(); protected: @@ -129,50 +134,52 @@ class WLEngine : public ClockedObject return _blocked; } - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; + protected: + virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; }; + System* const system; + const uint32_t queueSize; + const RequestorID requestorId; + + WLReqPort reqPort; + WLRespPort respPort; + WLMemPort memPort; + bool handleWLU(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read void readWLBuffer(); - bool handleMemResp(PacketPtr resp); //Events - void processNextWLReadEvent(); EventFunctionWrapper nextWLReadEvent; + void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + 
MPU::MPU::MemPortsendTimingReq */ - void processNextWLReduceEvent(); EventFunctionWrapper nextWLReduceEvent; + void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - System* const system; - const RequestorID requestorId; - std::unordered_map requestOffset; - AddrRangeList getAddrRanges() const; - void recvFunctional(PacketPtr pkt); - WLQueue updateQueue; WLQueue responseQueue; - WLMemPort memPort; - WLRespPort respPort; - WLReqPort reqPort; - public: + public: + AddrRangeList getAddrRanges() const; + bool handleWLUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr resp); + void recvFunctional(PacketPtr pkt); WLEngine(const WLEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; From 90800d55dd30af7e3fb47173bad39c3adf11ccbd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:24:55 -0800 Subject: [PATCH 024/247] Bug fix. 
--- src/accl/push_engine.cc | 26 ++++++++++++++++++++------ src/accl/push_engine.hh | 13 ++++++++++++- src/accl/wl_engine.cc | 9 ++------- src/accl/wl_engine.hh | 3 +-- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 6ebe34ebd3..746ed8a142 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -98,18 +98,32 @@ PushEngine::startup() } -bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +AddrRangeList +PushEngine::PushRespPort::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +bool +PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); } -AddrRangeList -PushEngine::PushRespPort::getAddrRanges() +Tick +PushEngine::PushRespPort::recvAtomic(PacketPtr pkt) { - return owner->getAddrRanges(); + panic("recvAtomic unimpl."); +} + +void +PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); } -bool PushEngine::handleUpdate(PacketPtr pkt) +bool +PushEngine::handleUpdate(PacketPtr pkt) { //FIXME: There should be a check if the queues are full. 
// if (vertexQueueLen < vertexQueueSize) { @@ -131,7 +145,7 @@ bool PushEngine::handleUpdate(PacketPtr pkt) void PushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.front(); - uint8_t *data = updatePkt->getPtr(); + uint8_t* data = updatePkt->getPtr(); // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) uint32_t edge_index = *((uint32_t *)data); diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 0acedd0da8..1aa70c7acb 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -58,8 +58,12 @@ class PushEngine : public ClockedObject _blocked(false), blockedPacket(nullptr) {} virtual AddrRangeList getAddrRanges(); - virtual bool recvTimingReq(PacketPtr pkt); + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class PushReqPort : public RequestPort @@ -76,6 +80,8 @@ class PushEngine : public ClockedObject {} void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } + + protected: virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; @@ -95,6 +101,8 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } + + protected: virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; @@ -138,6 +146,8 @@ class PushEngine : public ClockedObject AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + public: PushEngine(const PushEngineParams ¶ms); @@ -148,4 +158,5 @@ class PushEngine : public ClockedObject }; } + #endif // __ACCL_PUSH_ENGINE_HH__ diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 9b16a15575..bfabed33e9 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -76,12 +76,6 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -WLEngine::WLRespPort::trySendRetry() -{ - sendRetryReq(); -} - void 
WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) { @@ -162,7 +156,8 @@ WLEngine::recvFunctional(PacketPtr pkt) memPort.recvFunctional(pkt); } -bool WLEngine::handleWLUpdate(PacketPtr pkt){ +bool +WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ queue.sendPktRetry = true; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 8d02c16981..ad53fd7e7e 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -88,8 +88,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - virtual AddrRangeList getAddrRanges() const override; - void trySendRetry(); + virtual AddrRangeList getAddrRanges(); protected: virtual bool recvTimingReq(PacketPtr pkt); From f62d592c1a5a1f7d397e025a6d9f8a8037a17e12 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:36:23 -0800 Subject: [PATCH 025/247] Bug fix. --- src/accl/push_engine.cc | 24 ++++++++++++++++++------ src/accl/push_engine.hh | 13 +++++-------- src/accl/wl_engine.cc | 2 +- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 746ed8a142..bf385818f5 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -122,6 +122,24 @@ PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) owner->recvFunctional(pkt); } +void +PushEngine::PushRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +AddrRangeList +PushEngine::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + +void +PushEngine::recvFunctional(PacketPtr pkt) +{ + memPort.sendFunctional(pkt); +} + bool PushEngine::handleUpdate(PacketPtr pkt) { @@ -293,12 +311,6 @@ PushEngine::PushReqPort::recvReqRetry() } } -AddrRangeList -PushEngine::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - void PushEngine::PushMemPort::recvReqRetry() { diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 
1aa70c7acb..269170c045 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -131,23 +131,20 @@ class PushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + + bool handleUpdate(PacketPtr pkt); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); EventFunctionWrapper nextReadEvent; void processNextReadEvent(); + bool handleMemResp(PacketPtr pkt); EventFunctionWrapper nextSendEvent; void processNextSendEvent(); - bool handleUpdate(PacketPtr pkt); - - bool handleMemResp(PacketPtr pkt); - - AddrRangeList getAddrRanges(); - - void recvFunctional(PacketPtr pkt); - public: PushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index bfabed33e9..8365e754fc 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -153,7 +153,7 @@ WLEngine::getAddrRanges() const void WLEngine::recvFunctional(PacketPtr pkt) { - memPort.recvFunctional(pkt); + memPort.sendFunctional(pkt); } bool From e4cbf3493f1179d195209bc0aa007c7cda112506 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:50:05 -0800 Subject: [PATCH 026/247] Bug fixes. 
--- src/accl/wl_engine.cc | 16 +++++++++++----- src/accl/wl_engine.hh | 6 +++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 8365e754fc..872f38673e 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -41,11 +41,11 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - nextWLReadEvent([this]{processNextWLReadEvent; }, name()), - nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) + nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), + nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()), + updateQueue(queueSize), + responseQueue(queueSize) { - updateQueue.resize(queueSize); - responseQueue.resize(queueSize); } Port & @@ -88,6 +88,12 @@ WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) panic("recvAtomic unimpl."); } +void +WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + void WLEngine::WLRespPort::recvRespRetry() { @@ -256,7 +262,7 @@ WLEngine::processNextWLReduceEvent(){ } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index ad53fd7e7e..fe26d22aef 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -172,14 +172,14 @@ class WLEngine : public ClockedObject WLQueue updateQueue; WLQueue responseQueue; - - - public: AddrRangeList getAddrRanges() const; bool handleWLUpdate(PacketPtr pkt); bool handleMemResp(PacketPtr resp); void recvFunctional(PacketPtr pkt); + + public: WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; From b1e3386565a90f3c4170c72688da1e7f01a3ef7f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:56:40 -0800 Subject: [PATCH 027/247] 
Bug fix. --- src/accl/push_engine.hh | 5 +---- src/accl/wl_engine.cc | 2 +- src/accl/wl_engine.hh | 28 +++++++++++++--------------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 269170c045..ea9026ff8f 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -49,13 +49,10 @@ class PushEngine : public ClockedObject { private: PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; public: PushRespPort(const std::string& name, PushEngine* owner): - ResponsePort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) + ResponsePort(name, owner), owner(owner) {} virtual AddrRangeList getAddrRanges(); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 872f38673e..98c940a2de 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -262,7 +262,7 @@ WLEngine::processNextWLReduceEvent(){ } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); + respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index fe26d22aef..94ac7c7aff 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -82,12 +82,11 @@ class WLEngine : public ClockedObject { private: WLEngine *owner; - PacketPtr blockedPacket; public: - WLRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + WLRespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} virtual AddrRangeList getAddrRanges(); protected: @@ -105,12 +104,12 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + WLReqPort(const std::string& name, WLEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacketPtr pkt); - bool blocked(){ - return _blocked; - } + bool blocked() { 
return _blocked; } protected: void recvReqRetry() override; @@ -125,13 +124,12 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLMemPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + WLMemPort(const std::string& name, WLEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacketPtr pkt); - void trySendRetry(); - bool blocked(){ - return _blocked; - } + bool blocked() { return _blocked; } protected: virtual bool recvTimingResp(PacketPtr pkt); From 4541367e7f3091feb30a81c403cbdd9d1d1e9b0b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 13:44:39 -0800 Subject: [PATCH 028/247] Bug fix. --- src/accl/apply.cc | 12 ------------ src/accl/apply.hh | 34 +++++++++++++++------------------- src/accl/push_engine.cc | 2 +- src/accl/push_engine.hh | 2 +- src/accl/wl_engine.hh | 2 +- 5 files changed, 18 insertions(+), 34 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 410eff5268..b493d3d152 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -75,12 +75,6 @@ bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -Apply::ApplyRespPort::trySendRetry() -{ - sendRetryReq(); -} - void Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) { @@ -116,12 +110,6 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } } -void -Apply::ApplyMemPort::trySendRetry() -{ - sendRetryResp(); -} - void Apply::ApplyMemPort::recvReqRetry() { diff --git a/src/accl/apply.hh b/src/accl/apply.hh index f08c1fef85..6ab639c552 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -64,7 +64,7 @@ class Apply : public ClockedObject } PacketPtr pop(){ - return applyQueue->pop(); + return applyQueue.pop(); } PacketPtr front(){ @@ -79,16 +79,12 @@ class Apply : public ClockedObject { private: Apply *owner; - bool _blocked; - PacketPtr blockedPacket; public: - void trySendRetry(); - virtual AddrRangeList getAddrRanges(); 
ApplyRespPort(const std::string& name, Apply* owner): - ResponsePort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) + ResponsePort(name, owner), owner(owner) {} + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); @@ -140,16 +136,24 @@ class Apply : public ClockedObject void recvReqRetry() override; }; + System* const system; + const RequestorID requestorId; + ApplyMemPort memPort; ApplyRespPort respPort; ApplyReqPort reqPort; + ApplyQueue applyReadQueue; + ApplyQueue applyWriteQueue; + + std::unordered_map requestOffset; + bool handleWL(PacketPtr pkt); - bool sendPacket(); - //one queue for write and one for read a priotizes write over read - void readApplyBuffer(); + // bool sendPacket(); + // //one queue for write and one for read a priotizes write over read + // void readApplyBuffer(); bool handleMemResp(PacketPtr resp); - void writePushBuffer(); + // void writePushBuffer(); //Events void processNextApplyCheckEvent(); @@ -166,16 +170,8 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ - System* const system; - const RequestorID requestorId; - AddrRangeList getAddrRanges() const; - ApplyQueue applyReadQueue; - ApplyQueue applyWriteQueue; - - std::unordered_map requestOffset; - public: Apply(const ApplyParams &apply); diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index bf385818f5..fde79a5aa7 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -99,7 +99,7 @@ PushEngine::startup() } AddrRangeList -PushEngine::PushRespPort::getAddrRanges() +PushEngine::PushRespPort::getAddrRanges() const { return owner->getAddrRanges(); } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index ea9026ff8f..fbb7d6915a 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -54,7 +54,7 @@ class PushEngine : public ClockedObject PushRespPort(const std::string& name, PushEngine* owner): ResponsePort(name, owner), owner(owner) {} - 
virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 94ac7c7aff..504b63bc46 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -87,7 +87,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, WLEngine* owner): ResponsePort(name, owner), owner(owner) {} - virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); From eb31d031f86ed681b6e974aeda16456daf0e67ef Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 15 Feb 2022 17:02:08 -0800 Subject: [PATCH 029/247] Apply engine compiles --- src/accl/apply.cc | 33 +++++++++++++++++++++++++++------ src/accl/apply.hh | 45 ++++++++++++++++++++++----------------------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b493d3d152..55288693f3 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -41,10 +41,12 @@ Apply::Apply(const ApplyParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextApplyEvent([this]{ processNextApplyEvent(); }, name()), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + applyReadQueue(params.applyQueueSize), + applyWriteQueue(params.applyQueueSize) { - applyReadQueue(params.applyQueueSize); - applyWriteQueue(params.applyQueueSize); + // applyReadQueue(params.applyQueueSize); + // applyWriteQueue(params.applyQueueSize); } Port & @@ -62,7 +64,7 @@ Apply::getPort(const std::string &if_name, PortID idx) } AddrRangeList -Apply::ApplyRespPort::getAddrRanges() +Apply::ApplyRespPort::getAddrRanges() const { return owner->getAddrRanges(); } @@ -93,6 +95,12 @@ Apply::ApplyRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } 
+void +Apply::ApplyRespPort::trySendRetry() +{ + sendRetryReq(); +} + bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { @@ -118,6 +126,12 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } +void +Apply::ApplyMemPort::trySendRetry() +{ + sendRetryResp(); +} + void Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { @@ -135,6 +149,12 @@ Apply::ApplyReqPort::recvReqRetry() blockedPacket = nullptr; } +bool +Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvRespRetry from response port is called."); +} + AddrRangeList Apply::getAddrRanges() const { @@ -158,7 +178,8 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ - auto pkt = queue.pop(); + PacketPtr pkt = queue.front(); + queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ respPort.trySendRetry(); queue.sendPktRetry = false; @@ -229,7 +250,7 @@ Apply::processNextApplyEvent(){ } } }else{ - queue.pop(); + queue.applyQueue.pop(); if (queue.sendPktRetry && !queue.blocked()){ memPort.trySendRetry(); queue.sendPktRetry = false; diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 6ab639c552..7f17e173c6 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -63,8 +63,8 @@ class Apply : public ClockedObject applyQueue.push(pkt); } - PacketPtr pop(){ - return applyQueue.pop(); + void pop(){ + applyQueue.pop(); } PacketPtr front(){ @@ -72,20 +72,20 @@ class Apply : public ClockedObject } ApplyQueue(uint32_t qSize): - queueSize(qSize){} + queueSize(qSize), + sendPktRetry(false){} }; class ApplyRespPort : public ResponsePort { private: Apply *owner; - public: ApplyRespPort(const std::string& name, Apply* owner): ResponsePort(name, owner), owner(owner) {} virtual AddrRangeList getAddrRanges() const; - + void trySendRetry(); protected: virtual bool recvTimingReq(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt); @@ -105,7 +105,6 @@ class Apply : public ClockedObject RequestPort(name, owner), 
owner(owner), _blocked(false), blockedPacket(nullptr) {} - void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } @@ -139,9 +138,24 @@ class Apply : public ClockedObject System* const system; const RequestorID requestorId; - ApplyMemPort memPort; - ApplyRespPort respPort; ApplyReqPort reqPort; + ApplyRespPort respPort; + ApplyMemPort memPort; + + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + EventFunctionWrapper nextApplyCheckEvent; + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -155,21 +169,6 @@ class Apply : public ClockedObject bool handleMemResp(PacketPtr resp); // void writePushBuffer(); - //Events - void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyCheckEvent; - /* Syncronously checked - If there are any active vertecies: - create memory read packets + MPU::MPU::MemPortsendTimingReq - */ - void processNextApplyEvent(); - EventFunctionWrapper nextApplyEvent; - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - AddrRangeList getAddrRanges() const; public: From e3a7f1c1d727c2497e10003d781f404771345a5b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:22:29 -0800 Subject: [PATCH 030/247] Bug fix. Very close to first compilation. 
--- src/accl/apply.cc | 60 +++++++------------ src/accl/apply.hh | 87 ++++++++++++++------------- src/accl/push_engine.cc | 126 ++++++++++++++++++++++------------------ src/accl/util.hh | 14 +++++ src/accl/wl_engine.cc | 22 ++----- src/accl/wl_engine.hh | 3 +- 6 files changed, 153 insertions(+), 159 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 55288693f3..9c3d3f1c3d 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -30,6 +30,8 @@ #include +#include "accl/util.hh" + namespace gem5 { @@ -37,17 +39,14 @@ Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), system(params.system), requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), applyReadQueue(params.applyQueueSize), - applyWriteQueue(params.applyQueueSize) -{ - // applyReadQueue(params.applyQueueSize); - // applyWriteQueue(params.applyQueueSize); -} + applyWriteQueue(params.applyQueueSize), + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + nextApplyEvent([this]{ processNextApplyEvent(); }, name()) +{} Port & Apply::getPort(const std::string &if_name, PortID idx) @@ -96,22 +95,8 @@ Apply::ApplyRespPort::recvRespRetry() } void -Apply::ApplyRespPort::trySendRetry() -{ - sendRetryReq(); -} - -bool -Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; @@ -119,30 +104,27 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyMemPort::recvReqRetry() +Apply::ApplyReqPort::recvReqRetry() { _blocked = 
false; sendPacket(blockedPacket); blockedPacket = nullptr; } -void -Apply::ApplyMemPort::trySendRetry() +bool +Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) { - sendRetryResp(); + panic("recvTimingResp called on reqPort."); } -void -Apply::ApplyReqPort::sendPacket(PacketPtr pkt) +bool +Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + return owner->handleMemResp(pkt); } void -Apply::ApplyReqPort::recvReqRetry() +Apply::ApplyMemPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -179,9 +161,8 @@ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ PacketPtr pkt = queue.front(); - queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - respPort.trySendRetry(); + // respPort.trySendRetry(); queue.sendPktRetry = false; } // conver to ReadReq @@ -190,7 +171,8 @@ void Apply::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - memPort.sendPacket(memPkt); + memPort.sendPacke:(memPkt); + queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -245,14 +227,14 @@ Apply::processNextApplyEvent(){ reqPort.sendPacket(writePkt); queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } } }else{ queue.applyQueue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 7f17e173c6..2a16632e22 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -32,7 +32,6 @@ #include #include -#include "accl/util.hh" #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -49,31 +48,31 @@ class Apply : public 
ClockedObject private: struct ApplyQueue{ - std::queue applyQueue; - const uint32_t queueSize; - bool sendPktRetry; - - bool blocked(){ - return (applyQueue.size() == queueSize); - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - void pop(){ - applyQueue.pop(); - } - - PacketPtr front(){ - return applyQueue.front(); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize), - sendPktRetry(false){} + std::queue applyQueue; + const uint32_t queueSize; + bool sendPktRetry; + + bool blocked(){ + return (applyQueue.size() == queueSize); + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + void pop(){ + applyQueue.pop(); + } + + PacketPtr front(){ + return applyQueue.front(); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize) + {} }; class ApplyRespPort : public ResponsePort @@ -109,8 +108,8 @@ class Apply : public ClockedObject bool blocked() { return _blocked; } protected: - void recvReqRetry() override; virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; }; class ApplyMemPort : public RequestPort @@ -127,7 +126,7 @@ class Apply : public ClockedObject {} void sendPacket(PacketPtr pkt); - void trySendRetry(); + // void trySendRetry(); bool blocked(){ return _blocked;} protected: @@ -138,25 +137,10 @@ class Apply : public ClockedObject System* const system; const RequestorID requestorId; - ApplyReqPort reqPort; ApplyRespPort respPort; + ApplyReqPort reqPort; ApplyMemPort memPort; - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - - EventFunctionWrapper nextApplyCheckEvent; - void processNextApplyCheckEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + 
MPU::MPU::MemPortsendTimingReq - */ - ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -169,6 +153,21 @@ class Apply : public ClockedObject bool handleMemResp(PacketPtr resp); // void writePushBuffer(); + //Events + EventFunctionWrapper nextApplyCheckEvent; + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + AddrRangeList getAddrRanges() const; public: diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index fde79a5aa7..125433653b 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -26,9 +26,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/util.hh" #include "accl/push_engine.hh" +#include "accl/util.hh" + namespace gem5 { @@ -128,6 +129,68 @@ PushEngine::PushRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } +void +PushEngine::PushReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::PushReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::PushReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +void +PushEngine::PushMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + AddrRangeList PushEngine::getAddrRanges() { @@ -224,24 +287,8 @@ void PushEngine::processNextReadEvent() } } -bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -PushEngine::PushMemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool PushEngine::handleMemResp(PacketPtr pkt) +bool +PushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -259,7 +306,8 @@ bool PushEngine::handleMemResp(PacketPtr pkt) // TODO: Implement propagate function here *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, + requestorId); updateQueue.push(update); } @@ -286,42 +334,4 @@ void PushEngine::processNextSendEvent() } } -void -PushEngine::PushReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -void -PushEngine::PushReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -void -PushEngine::PushMemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - } diff --git a/src/accl/util.hh b/src/accl/util.hh index 91692488a4..b3cff93f15 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "base/cprintf.hh" #include "base/types.hh" #include "mem/packet.hh" #include "mem/request.hh" @@ -39,12 +40,25 @@ struct WorkListItem uint32_t prop; uint32_t degree; uint32_t edgeIndex; + + std::string to_string() + { + return csprintf( + "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", + temp_prop, prop, degree, edgeIndex); + } + }; struct Edge { uint64_t weight; Addr neighbor; + + std::string to_string() + { + return csprintf("Edge{weight: %lu, neighbor: %lu}", weight, neighbor); + } }; WorkListItem memoryToWorkList(uint8_t* data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 98c940a2de..eb883cb19b 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -30,6 +30,8 @@ #include +#include "accl/util.hh" + namespace gem5 { @@ -76,12 +78,6 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - Tick WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) { @@ -125,12 +121,6 @@ WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -void -WLEngine::WLMemPort::trySendRetry() -{ - sendRetryResp(); -} - void WLEngine::WLReqPort::recvReqRetry() { @@ -244,12 +234,12 @@ WLEngine::processNextWLReduceEvent(){ applyPort.sendPacket(writePkt); queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = false; } } @@ -257,12 +247,12 @@ WLEngine::processNextWLReduceEvent(){ else{ queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = 
false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 504b63bc46..ee25154caa 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -32,7 +32,6 @@ #include #include -#include "accl/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" @@ -140,8 +139,8 @@ class WLEngine : public ClockedObject const uint32_t queueSize; const RequestorID requestorId; - WLReqPort reqPort; WLRespPort respPort; + WLReqPort reqPort; WLMemPort memPort; bool handleWLU(PacketPtr pkt); From 099f68905a083c566dcb1334b9c1b4fae3c1edcf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:46:20 -0800 Subject: [PATCH 031/247] More bug fixes. --- src/accl/apply.cc | 8 +------- src/accl/util.hh | 3 ++- src/accl/wl_engine.cc | 12 +++++------- src/accl/wl_engine.hh | 18 ++++++------------ 4 files changed, 14 insertions(+), 27 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 9c3d3f1c3d..b18c990da2 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -131,12 +131,6 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } -bool -Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvRespRetry from response port is called."); -} - AddrRangeList Apply::getAddrRanges() const { @@ -171,7 +165,7 @@ void Apply::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - memPort.sendPacke:(memPkt); + memPort.sendPacket(memPkt); queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ diff --git a/src/accl/util.hh b/src/accl/util.hh index b3cff93f15..a4418a1cb8 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -71,6 +71,7 @@ PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); -PacketPtr getUpdatePacket(Addr addr, unsigned 
int size, uint8_t *data); +PacketPtr getUpdatePacket(Addr addr, unsigned int size, + uint8_t *data, RequestorID requestorId); } diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index eb883cb19b..614f34d175 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -38,17 +38,15 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), system(params.system), - queueSize(params.wlQueueSize), requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), + updateQueue(params.wlQueueSize), + responseQueue(params.wlQueueSize), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), - nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()), - updateQueue(queueSize), - responseQueue(queueSize) -{ -} + nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) +{} Port & WLEngine::getPort(const std::string &if_name, PortID idx) diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index ee25154caa..57cc063880 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -136,26 +136,26 @@ class WLEngine : public ClockedObject }; System* const system; - const uint32_t queueSize; const RequestorID requestorId; WLRespPort respPort; WLReqPort reqPort; WLMemPort memPort; - bool handleWLU(PacketPtr pkt); - bool sendPacket(); - //one queue for write and one for read a priotizes write over read - void readWLBuffer(); + WLQueue updateQueue; + WLQueue responseQueue; + std::unordered_map requestOffset; //Events + bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ + bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by 
MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -164,14 +164,8 @@ class WLEngine : public ClockedObject Write edgelist loc in buffer */ - std::unordered_map requestOffset; - - WLQueue updateQueue; - WLQueue responseQueue; - AddrRangeList getAddrRanges() const; - bool handleWLUpdate(PacketPtr pkt); - bool handleMemResp(PacketPtr resp); + void recvFunctional(PacketPtr pkt); public: From 793d75564e15d66b6d8e81f2a75dfd324465eb41 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:53:21 -0800 Subject: [PATCH 032/247] Compilation. yeay. --- src/accl/apply.cc | 9 +++++++++ src/accl/wl_engine.cc | 40 +++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b18c990da2..40002c5264 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -117,6 +117,15 @@ Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) panic("recvTimingResp called on reqPort."); } +void +Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 614f34d175..d2ecd0d7c9 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -94,17 +94,14 @@ WLEngine::WLRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } -void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +bool +WLEngine::WLReqPort::recvTimingResp(PacketPtr) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + panic("recvTimingResp called on the request port."); } void -WLEngine::WLMemPort::recvReqRetry() +WLEngine::WLReqPort::recvReqRetry() { // We should have a blocked packet if this function is called. 
assert(_blocked && blockedPacket != nullptr); @@ -113,14 +110,26 @@ WLEngine::WLMemPort::recvReqRetry() blockedPacket = nullptr; } -bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +void +WLEngine::WLReqPort::sendPacket(PacketPtr pkt) { - return owner->handleMemResp(pkt); + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } } void -WLEngine::WLReqPort::recvReqRetry() +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() { // We should have a blocked packet if this function is called. assert(_blocked && blockedPacket != nullptr); @@ -129,13 +138,10 @@ WLEngine::WLReqPort::recvReqRetry() blockedPacket = nullptr; } -void -WLEngine::WLReqPort::sendPacket(PacketPtr pkt) +bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + return owner->handleMemResp(pkt); } AddrRangeList From 5e05fe3d6caa51cada748e2dc6e2e200c84932c7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 16 Feb 2022 10:31:28 -0800 Subject: [PATCH 033/247] Fixing a typo. --- src/accl/PushEngine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py index 3215fdaee2..840d8dea1f 100644 --- a/src/accl/PushEngine.py +++ b/src/accl/PushEngine.py @@ -30,7 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class PushEngine(ClockedObject): - type = 'WLEngine' + type = 'PushEngine' cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' From f35e40e74c7b42f5cd3ffc68b89ef2a714f5dab9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 18 Feb 2022 14:08:41 -0800 Subject: [PATCH 034/247] Restructuring the directory. 
--- src/accl/{ => graph/base}/Apply.py | 0 src/accl/{ => graph/base}/PushEngine.py | 0 src/accl/{ => graph/base}/SConscript | 0 src/accl/{ => graph/base}/WLEngine.py | 0 src/accl/{ => graph/base}/apply.cc | 73 +-------------------- src/accl/{ => graph/base}/apply.hh | 44 +------------ src/accl/{ => graph/base}/push_engine.cc | 0 src/accl/{ => graph/base}/push_engine.hh | 0 src/accl/{ => graph/base}/util.cc | 0 src/accl/{ => graph/base}/util.hh | 0 src/accl/{ => graph/base}/wl_engine.cc | 83 +----------------------- src/accl/{ => graph/base}/wl_engine.hh | 49 +------------- src/accl/graph/sega/mpu.hh | 0 13 files changed, 7 insertions(+), 242 deletions(-) rename src/accl/{ => graph/base}/Apply.py (100%) rename src/accl/{ => graph/base}/PushEngine.py (100%) rename src/accl/{ => graph/base}/SConscript (100%) rename src/accl/{ => graph/base}/WLEngine.py (100%) rename src/accl/{ => graph/base}/apply.cc (80%) rename src/accl/{ => graph/base}/apply.hh (78%) rename src/accl/{ => graph/base}/push_engine.cc (100%) rename src/accl/{ => graph/base}/push_engine.hh (100%) rename src/accl/{ => graph/base}/util.cc (100%) rename src/accl/{ => graph/base}/util.hh (100%) rename src/accl/{ => graph/base}/wl_engine.cc (79%) rename src/accl/{ => graph/base}/wl_engine.hh (75%) create mode 100644 src/accl/graph/sega/mpu.hh diff --git a/src/accl/Apply.py b/src/accl/graph/base/Apply.py similarity index 100% rename from src/accl/Apply.py rename to src/accl/graph/base/Apply.py diff --git a/src/accl/PushEngine.py b/src/accl/graph/base/PushEngine.py similarity index 100% rename from src/accl/PushEngine.py rename to src/accl/graph/base/PushEngine.py diff --git a/src/accl/SConscript b/src/accl/graph/base/SConscript similarity index 100% rename from src/accl/SConscript rename to src/accl/graph/base/SConscript diff --git a/src/accl/WLEngine.py b/src/accl/graph/base/WLEngine.py similarity index 100% rename from src/accl/WLEngine.py rename to src/accl/graph/base/WLEngine.py diff --git 
a/src/accl/apply.cc b/src/accl/graph/base/apply.cc similarity index 80% rename from src/accl/apply.cc rename to src/accl/graph/base/apply.cc index 40002c5264..eae9c2fd16 100644 --- a/src/accl/apply.cc +++ b/src/accl/graph/base/apply.cc @@ -30,17 +30,13 @@ #include -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), @@ -51,72 +47,13 @@ Apply::Apply(const ApplyParams ¶ms): Port & Apply::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } -AddrRangeList -Apply::ApplyRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleWL(pkt)){ - return false; - } - return true; -} - -void -Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) -{ - panic("Not implemented"); -} - -Tick -Apply::ApplyRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -Apply::ApplyRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -Apply::ApplyReqPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -Apply::ApplyReqPort::recvReqRetry() -{ - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -bool -Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on reqPort."); -} - void Apply::ApplyMemPort::sendPacket(PacketPtr pkt) { @@ -140,12 
+77,6 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } -AddrRangeList -Apply::getAddrRanges() const -{ - return memPort.getAddrRanges(); -} - bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ diff --git a/src/accl/apply.hh b/src/accl/graph/base/apply.hh similarity index 78% rename from src/accl/apply.hh rename to src/accl/graph/base/apply.hh index 2a16632e22..a3f0ff5aa3 100644 --- a/src/accl/apply.hh +++ b/src/accl/graph/base/apply.hh @@ -46,7 +46,7 @@ namespace gem5 class Apply : public ClockedObject { private: - + //FIXME: Remove queue defenition from here. struct ApplyQueue{ std::queue applyQueue; const uint32_t queueSize; @@ -75,43 +75,6 @@ class Apply : public ClockedObject {} }; - class ApplyRespPort : public ResponsePort - { - private: - Apply *owner; - public: - ApplyRespPort(const std::string& name, Apply* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - void trySendRetry(); - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class ApplyReqPort : public RequestPort - { - private: - Apply *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ApplyReqPort(const std::string& name, Apply* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - class ApplyMemPort : public RequestPort { private: @@ -134,11 +97,8 @@ class Apply : public ClockedObject void recvReqRetry() override; }; - System* const system; const RequestorID requestorId; - ApplyRespPort respPort; - ApplyReqPort reqPort; ApplyMemPort memPort; ApplyQueue applyReadQueue; @@ -168,8 +128,6 @@ class Apply : public ClockedObject Write edgelist 
loc in buffer */ - AddrRangeList getAddrRanges() const; - public: Apply(const ApplyParams &apply); diff --git a/src/accl/push_engine.cc b/src/accl/graph/base/push_engine.cc similarity index 100% rename from src/accl/push_engine.cc rename to src/accl/graph/base/push_engine.cc diff --git a/src/accl/push_engine.hh b/src/accl/graph/base/push_engine.hh similarity index 100% rename from src/accl/push_engine.hh rename to src/accl/graph/base/push_engine.hh diff --git a/src/accl/util.cc b/src/accl/graph/base/util.cc similarity index 100% rename from src/accl/util.cc rename to src/accl/graph/base/util.cc diff --git a/src/accl/util.hh b/src/accl/graph/base/util.hh similarity index 100% rename from src/accl/util.hh rename to src/accl/graph/base/util.hh diff --git a/src/accl/wl_engine.cc b/src/accl/graph/base/wl_engine.cc similarity index 79% rename from src/accl/wl_engine.cc rename to src/accl/graph/base/wl_engine.cc index d2ecd0d7c9..dc8f1dd744 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/graph/base/wl_engine.cc @@ -26,21 +26,17 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/wl_engine.hh" +#include "accl/graph/base/wl_engine.hh" #include -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), @@ -51,74 +47,13 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): Port & WLEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } -AddrRangeList -WLEngine::WLRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleWLUpdate(pkt)){ - return false; - } - return true; -} - -Tick -WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::WLRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -bool -WLEngine::WLReqPort::recvTimingResp(PacketPtr) -{ - panic("recvTimingResp called on the request port."); -} - -void -WLEngine::WLReqPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. 
- assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -void -WLEngine::WLReqPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - void WLEngine::WLMemPort::sendPacket(PacketPtr pkt) { @@ -144,18 +79,6 @@ WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -AddrRangeList -WLEngine::getAddrRanges() const -{ - return memPort.getAddrRanges(); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - memPort.sendFunctional(pkt); -} - bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; diff --git a/src/accl/wl_engine.hh b/src/accl/graph/base/wl_engine.hh similarity index 75% rename from src/accl/wl_engine.hh rename to src/accl/graph/base/wl_engine.hh index 57cc063880..3654999b70 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/graph/base/wl_engine.hh @@ -46,7 +46,7 @@ namespace gem5 class WLEngine : public ClockedObject { private: - + //FIXME: Change this struct WLQueue{ std::queue wlQueue; uint32_t queueSize; @@ -77,44 +77,6 @@ class WLEngine : public ClockedObject sendPktRetry(false){} }; - class WLRespPort : public ResponsePort //From Push engine - { - private: - WLEngine *owner; - - public: - WLRespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class WLReqPort : public RequestPort //To Apply Engine - { - private: - WLEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - WLReqPort(const std::string& name, WLEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - 
- protected: - void recvReqRetry() override; - virtual bool recvTimingResp(PacketPtr pkt); - }; - class WLMemPort : public RequestPort { private: @@ -135,11 +97,6 @@ class WLEngine : public ClockedObject void recvReqRetry() override; }; - System* const system; - const RequestorID requestorId; - - WLRespPort respPort; - WLReqPort reqPort; WLMemPort memPort; WLQueue updateQueue; @@ -164,10 +121,6 @@ class WLEngine : public ClockedObject Write edgelist loc in buffer */ - AddrRangeList getAddrRanges() const; - - void recvFunctional(PacketPtr pkt); - public: WLEngine(const WLEngineParams ¶ms); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..e69de29bb2 From d02f3824f8f6fc41ae6cff87bfccff497405d78a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 09:59:09 -0800 Subject: [PATCH 035/247] Restructing the classes. --- src/accl/graph/base/Apply.py | 5 +--- src/accl/graph/base/PushEngine.py | 3 -- src/accl/graph/base/WLEngine.py | 5 +--- src/accl/graph/sega/MPU.py | 46 +++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 11 deletions(-) create mode 100644 src/accl/graph/sega/MPU.py diff --git a/src/accl/graph/base/Apply.py b/src/accl/graph/base/Apply.py index 8720287cc8..80aa430139 100644 --- a/src/accl/graph/base/Apply.py +++ b/src/accl/graph/base/Apply.py @@ -34,8 +34,5 @@ class Apply(ClockedObject): cxx_header = "accl/apply.hh" cxx_class = 'gem5::Apply' - system = Param.System(Parent.any, "The system object this apply engine is a part of") - respPort = ResponsePort("Receives requests from WorkList") - reqPort = RequestPort("Sends requests to Push") - memPort = RequestPort("Memory side port, sends requests") applyQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/PushEngine.py b/src/accl/graph/base/PushEngine.py index 840d8dea1f..7fef165169 100644 --- a/src/accl/graph/base/PushEngine.py 
+++ b/src/accl/graph/base/PushEngine.py @@ -34,7 +34,4 @@ class PushEngine(ClockedObject): cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' - system = Param.System(Parent.any, "The system object this push engine is a part of") - respPort = ResponsePort("Port to Receive updates from outside") - reqPort = RequestPort("Port to send updates to the outside") memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/WLEngine.py b/src/accl/graph/base/WLEngine.py index 562fd04423..deaee20935 100644 --- a/src/accl/graph/base/WLEngine.py +++ b/src/accl/graph/base/WLEngine.py @@ -34,8 +34,5 @@ class WLEngine(ClockedObject): cxx_header = "accl/wl_engine.hh" cxx_class = 'gem5::WLEngine' - system = Param.System(Parent.any, "The system object this push WorkList is a part of") - respPort = ResponsePort("Receives updates") - reqPort = RequestPort("Sends requests to Apply") - memPort = RequestPort("Memory side port, sends requests") wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..b6e136dda5 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.WLEngine import WLEngine +from m5.objects.PushEngine import PushEngine +from m5.objects.ApplyEngine import ApplyEngine + +class MPU(ClockedObject): + type = 'MPU' + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = 'gem5::MPU' + + workListEngine = Param.WLEngine("WLEngine object to connect to " + "This MPU") + applyEngine = Param.ApplyEngine("ApplyEngine object to connect to " + "This MPU") + pushEngine = Param.PushEngine("PushEngine object to connect to " + "This MPU") From bfb12794aa99858bb88afab45640cc27c90bde76 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 11:12:50 -0800 Subject: [PATCH 036/247] Sperating WLEngine and BaseWLEngine + few changes in BaseApplyEngine --- .../base/{Apply.py => BaseApplyEngine.py} | 0 src/accl/graph/base/BaseWLEngine.py | 38 ++++++++++++++++++ .../base/{apply.cc => base_apply_engine.cc} | 20 +++++----- .../base/{apply.hh => base_apply_engine.hh} | 35 +++++----------- .../base/{wl_engine.cc => base_wl_engine.cc} | 20 +++++----- .../base/{wl_engine.hh => base_wl_engine.hh} | 13 +++--- src/accl/graph/sega/ApplyEngine.py | 40 +++++++++++++++++++ src/accl/graph/{base => sega}/WLEngine.py | 12 +++--- src/accl/graph/sega/apply_engine.cc | 0 src/accl/graph/sega/apply_engine.hh | 0 src/accl/graph/sega/wl_engine.cc | 0 src/accl/graph/sega/wl_engine.hh | 0 12 files changed, 120 insertions(+), 58 deletions(-) rename src/accl/graph/base/{Apply.py => BaseApplyEngine.py} (100%) create mode 100644 src/accl/graph/base/BaseWLEngine.py rename src/accl/graph/base/{apply.cc => base_apply_engine.cc} (91%) rename src/accl/graph/base/{apply.hh => base_apply_engine.hh} (79%) rename src/accl/graph/base/{wl_engine.cc => base_wl_engine.cc} (91%) rename src/accl/graph/base/{wl_engine.hh => base_wl_engine.hh} (93%) create mode 100644 src/accl/graph/sega/ApplyEngine.py rename 
src/accl/graph/{base => sega}/WLEngine.py (84%) create mode 100644 src/accl/graph/sega/apply_engine.cc create mode 100644 src/accl/graph/sega/apply_engine.hh create mode 100644 src/accl/graph/sega/wl_engine.cc create mode 100644 src/accl/graph/sega/wl_engine.hh diff --git a/src/accl/graph/base/Apply.py b/src/accl/graph/base/BaseApplyEngine.py similarity index 100% rename from src/accl/graph/base/Apply.py rename to src/accl/graph/base/BaseApplyEngine.py diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py new file mode 100644 index 0000000000..7384e876ef --- /dev/null +++ b/src/accl/graph/base/BaseWLEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseWLEngine(ClockedObject): + type = 'BaseWLEngine' + cxx_header = "accl/base_wl_engine.hh" + cxx_class = 'gem5::BaseWLEngine' + + wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/apply.cc b/src/accl/graph/base/base_apply_engine.cc similarity index 91% rename from src/accl/graph/base/apply.cc rename to src/accl/graph/base/base_apply_engine.cc index eae9c2fd16..c88d14a2c2 100644 --- a/src/accl/graph/base/apply.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/apply.hh" +#include "accl/base_apply_engine.hh" #include @@ -35,7 +35,7 @@ namespace gem5 { -Apply::Apply(const ApplyParams ¶ms): +BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), @@ -45,7 +45,7 @@ Apply::Apply(const ApplyParams ¶ms): {} Port & -Apply::getPort(const std::string &if_name, PortID idx) +BaseApplyEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "memPort") { return memPort; @@ -55,7 +55,7 @@ Apply::getPort(const std::string &if_name, PortID idx) } void -Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -64,20 +64,20 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } bool -Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) +BaseApplyEngine::ApplyMemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } void -Apply::ApplyMemPort::recvReqRetry() +BaseApplyEngine::ApplyMemPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); blockedPacket = nullptr; } -bool Apply::handleWL(PacketPtr pkt){ +bool BaseApplyEngine::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ queue.sendPktRetry = true; @@ -91,7 +91,7 @@ bool Apply::handleWL(PacketPtr pkt){ return true; } -void Apply::processNextApplyCheckEvent(){ +void BaseApplyEngine::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ PacketPtr pkt = queue.front(); @@ -114,7 +114,7 @@ void Apply::processNextApplyCheckEvent(){ } bool -Apply::handleMemResp(PacketPtr pkt) +BaseApplyEngine::handleMemResp(PacketPtr pkt) { auto queue = applyWriteQueue; @@ -132,7 +132,7 @@ Apply::handleMemResp(PacketPtr pkt) } void -Apply::processNextApplyEvent(){ +BaseApplyEngine::processNextApplyEvent(){ auto queue = applyWriteQueue; PacketPtr pkt = queue.front(); uint8_t* data = 
pkt->getPtr(); diff --git a/src/accl/graph/base/apply.hh b/src/accl/graph/base/base_apply_engine.hh similarity index 79% rename from src/accl/graph/base/apply.hh rename to src/accl/graph/base/base_apply_engine.hh index a3f0ff5aa3..c2d2f26387 100644 --- a/src/accl/graph/base/apply.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -32,18 +32,16 @@ #include #include -#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/Apply.hh" +#include "params/BaseApplyEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" -#include "sim/system.hh" namespace gem5 { -class Apply : public ClockedObject +class BaseApplyEngine : public ClockedObject { private: //FIXME: Remove queue defenition from here. @@ -75,21 +73,20 @@ class Apply : public ClockedObject {} }; - class ApplyMemPort : public RequestPort + class MemPort : public RequestPort { private: - Apply *owner; + BaseApplyEngine *owner; bool _blocked; PacketPtr blockedPacket; public: - ApplyMemPort(const std::string& name, Apply* owner): + MemPort(const std::string& name, BaseApplyEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} void sendPacket(PacketPtr pkt); - // void trySendRetry(); bool blocked(){ return _blocked;} protected: @@ -99,7 +96,7 @@ class Apply : public ClockedObject const RequestorID requestorId; - ApplyMemPort memPort; + MemPort memPort; ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -107,29 +104,15 @@ class Apply : public ClockedObject std::unordered_map requestOffset; bool handleWL(PacketPtr pkt); - // bool sendPacket(); - // //one queue for write and one for read a priotizes write over read - // void readApplyBuffer(); - bool handleMemResp(PacketPtr resp); - // void writePushBuffer(); - - //Events EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + 
MPU::MPU::MemPortsendTimingReq - */ + + bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ public: - Apply(const ApplyParams &apply); + BaseApplyEngine(const ApplyParams &apply); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc similarity index 91% rename from src/accl/graph/base/wl_engine.cc rename to src/accl/graph/base/base_wl_engine.cc index dc8f1dd744..7261069c17 100644 --- a/src/accl/graph/base/wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/graph/base/wl_engine.hh" +#include "accl/graph/base/base_wl_engine.hh" #include @@ -35,7 +35,7 @@ namespace gem5 { -WLEngine::WLEngine(const WLEngineParams ¶ms): +BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), @@ -45,7 +45,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): {} Port & -WLEngine::getPort(const std::string &if_name, PortID idx) +BaseWLEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "memPort") { return memPort; @@ -55,7 +55,7 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -64,7 +64,7 @@ WLEngine::WLMemPort::sendPacket(PacketPtr pkt) } void -WLEngine::WLMemPort::recvReqRetry() +BaseWLEngine::WLMemPort::recvReqRetry() { // We should have a blocked packet if this function is called. 
assert(_blocked && blockedPacket != nullptr); @@ -74,13 +74,13 @@ WLEngine::WLMemPort::recvReqRetry() } bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +BaseWLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } bool -WLEngine::handleWLUpdate(PacketPtr pkt){ +BaseWLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ queue.sendPktRetry = true; @@ -94,7 +94,7 @@ WLEngine::handleWLUpdate(PacketPtr pkt){ return true; } -void WLEngine::processNextWLReadEvent(){ +void BaseWLEngine::processNextWLReadEvent(){ auto queue = updateQueue; while (!queue.empty()){ //create a map instead of front PacketPtr pkt = queue.front(); @@ -117,7 +117,7 @@ void WLEngine::processNextWLReadEvent(){ } bool -WLEngine::handleMemResp(PacketPtr pkt) +BaseWLEngine::handleMemResp(PacketPtr pkt) { auto queue = responseQueue; if (queue.blocked()){ @@ -134,7 +134,7 @@ WLEngine::handleMemResp(PacketPtr pkt) } void -WLEngine::processNextWLReduceEvent(){ +BaseWLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; auto applyPort = reqPort; diff --git a/src/accl/graph/base/wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh similarity index 93% rename from src/accl/graph/base/wl_engine.hh rename to src/accl/graph/base/base_wl_engine.hh index 3654999b70..2095a20f1b 100644 --- a/src/accl/graph/base/wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -35,7 +35,7 @@ #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" -#include "params/WLEngine.hh" +#include "params/BaseWLEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" #include "sim/system.hh" @@ -43,7 +43,7 @@ namespace gem5 { -class WLEngine : public ClockedObject +class BaseWLEngine : public ClockedObject { private: //FIXME: Change this @@ -77,7 +77,7 @@ class WLEngine : public ClockedObject sendPktRetry(false){} }; - class WLMemPort : public RequestPort + class MemPort : public RequestPort 
{ private: WLEngine *owner; @@ -85,7 +85,7 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLMemPort(const std::string& name, WLEngine* owner): + MemPort(const std::string& name, WLEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -97,8 +97,7 @@ class WLEngine : public ClockedObject void recvReqRetry() override; }; - WLMemPort memPort; - + MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; @@ -122,7 +121,7 @@ class WLEngine : public ClockedObject */ public: - WLEngine(const WLEngineParams ¶ms); + BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py new file mode 100644 index 0000000000..0d03e71e54 --- /dev/null +++ b/src/accl/graph/sega/ApplyEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from build.NULL.python.m5.proxy import Parent +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.BaseApplyEngine import BaseApplyEngine + +class ApplyEngine(BaseApplyEngine): + type = 'ApplyEngine' + cxx_header = "accl/graph/sega/apply_engine.hh" + cxx_class = 'gem5::MPU' + + mpu = Param.MPU(Parent, "MPU object that owns this ApplyEngine") diff --git a/src/accl/graph/base/WLEngine.py b/src/accl/graph/sega/WLEngine.py similarity index 84% rename from src/accl/graph/base/WLEngine.py rename to src/accl/graph/sega/WLEngine.py index deaee20935..a8f3bd20ea 100644 --- a/src/accl/graph/base/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -25,14 +25,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.BaseWLEngine import BaseWLEngine -class WLEngine(ClockedObject): +class WLEngine(BaseWLEngine): type = 'WLEngine' - cxx_header = "accl/wl_engine.hh" - cxx_class = 'gem5::WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::MPU' - wlQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") + mpu = Param.MPU(Parent, "MPU object that owns this WLEngine") \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh new file mode 100644 index 0000000000..e69de29bb2 From bfdec933f77713641144d1a2bd4fa1c4aec53faa Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 11:25:17 -0800 Subject: [PATCH 037/247] Restructuring classes. 
--- src/accl/graph/base/BasePushEngine.py | 37 ++++++ src/accl/graph/base/SConscript | 4 +- .../{push_engine.cc => base_push_engine.cc} | 125 +++++------------- .../{push_engine.hh => base_push_engine.hh} | 66 ++------- src/accl/graph/sega/MPU.py | 2 +- src/accl/graph/{base => sega}/PushEngine.py | 14 +- src/accl/graph/sega/push_engine.cc | 0 src/accl/graph/sega/push_engine.hh | 0 8 files changed, 90 insertions(+), 158 deletions(-) create mode 100644 src/accl/graph/base/BasePushEngine.py rename src/accl/graph/base/{push_engine.cc => base_push_engine.cc} (77%) rename src/accl/graph/base/{push_engine.hh => base_push_engine.hh} (66%) rename src/accl/graph/{base => sega}/PushEngine.py (83%) create mode 100644 src/accl/graph/sega/push_engine.cc create mode 100644 src/accl/graph/sega/push_engine.hh diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py new file mode 100644 index 0000000000..6ed5d25978 --- /dev/null +++ b/src/accl/graph/base/BasePushEngine.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BasePushEngine(ClockedObject): + type = 'BasePushEngine' + cxx_header = "accl/graph/base/base_push_engine.hh" + cxx_class = 'gem5::BasePushEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 18ac71eb7d..a881fa1e6e 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -28,10 +28,10 @@ Import('*') SimObject('Apply.py') -SimObject('PushEngine.py') +SimObject('BasePushEngine.py') SimObject('WLEngine.py') Source('apply.cc') -Source('push_engine.cc') +Source('base_push_engine.cc') Source('wl_engine.cc') Source('util.cc') diff --git a/src/accl/graph/base/push_engine.cc b/src/accl/graph/base/base_push_engine.cc similarity index 77% rename from src/accl/graph/base/push_engine.cc rename to src/accl/graph/base/base_push_engine.cc index 125433653b..9fbc89221f 100644 --- a/src/accl/graph/base/push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -26,18 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/push_engine.hh" +#include "accl/graph/base/base_push_engine.hh" -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), - respPort(name() + ".respPort", this), +BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), @@ -50,21 +47,29 @@ PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), } Port & -PushEngine::getPort(const std::string &if_name, PortID idx) +BasePushEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } +RequestorID +BasePushEngine::getRequestorId() +{ + return requestorId; +} + +void +BasePushEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void -PushEngine::startup() +BasePushEngine::startup() { //FIXME: This is the current version of our initializer. // This should be updated in the future. 
@@ -99,75 +104,14 @@ PushEngine::startup() } -AddrRangeList -PushEngine::PushRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleUpdate(pkt); -} - -Tick -PushEngine::PushRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -PushEngine::PushRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -PushEngine::PushReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -PushEngine::PushReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::PushReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - bool -PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) +BasePushEngine::MemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } void -PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +BasePushEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
@@ -179,7 +123,7 @@ PushEngine::PushMemPort::sendPacket(PacketPtr pkt) } void -PushEngine::PushMemPort::recvReqRetry() +BasePushEngine::MemPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); @@ -191,20 +135,8 @@ PushEngine::PushMemPort::recvReqRetry() } } -AddrRangeList -PushEngine::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - -void -PushEngine::recvFunctional(PacketPtr pkt) -{ - memPort.sendFunctional(pkt); -} - bool -PushEngine::handleUpdate(PacketPtr pkt) +BasePushEngine::handleUpdate(PacketPtr pkt) { //FIXME: There should be a check if the queues are full. // if (vertexQueueLen < vertexQueueSize) { @@ -223,7 +155,8 @@ PushEngine::handleUpdate(PacketPtr pkt) return true; } -void PushEngine::processNextReceiveEvent() +void +BasePushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.front(); uint8_t* data = updatePkt->getPtr(); @@ -274,7 +207,8 @@ void PushEngine::processNextReceiveEvent() } } -void PushEngine::processNextReadEvent() +void +BasePushEngine::processNextReadEvent() { PacketPtr pkt = memReqQueue.front(); if (!memPort.blocked()) { @@ -288,7 +222,7 @@ void PushEngine::processNextReadEvent() } bool -PushEngine::handleMemResp(PacketPtr pkt) +BasePushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -321,7 +255,8 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -void PushEngine::processNextSendEvent() +void +BasePushEngine::processNextSendEvent() { PacketPtr pkt = updateQueue.front(); if (!reqPort.blocked()) { diff --git a/src/accl/graph/base/push_engine.hh b/src/accl/graph/base/base_push_engine.hh similarity index 66% rename from src/accl/graph/base/push_engine.hh rename to src/accl/graph/base/base_push_engine.hh index fbb7d6915a..591f4ab734 100644 --- a/src/accl/graph/base/push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -31,67 +31,27 @@ #include -#include "base/addr_range.hh" #include 
"mem/port.hh" #include "mem/packet.hh" -#include "params/PushEngine.hh" +#include "params/BasePushEngine.hh" #include "sim/clocked_object.hh" -#include "sim/system.hh" namespace gem5 { -class PushEngine : public ClockedObject +class BasePushEngine : public ClockedObject { private: - class PushRespPort : public ResponsePort + class MemPort : public RequestPort { private: - PushEngine* owner; - - public: - PushRespPort(const std::string& name, PushEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class PushReqPort : public RequestPort - { - private: - PushEngine* owner; + BasePushEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - PushReqPort(const std::string& name, PushEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - class PushMemPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - PushMemPort(const std::string& name, PushEngine* owner): + MemPort(const std::string& name, PushEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -106,13 +66,9 @@ class PushEngine : public ClockedObject virtual void startup() override; - System* const system; - const RequestorID requestorId; + RequestorID requestorId; - PushReqPort reqPort; - PushRespPort respPort; - - PushMemPort memPort; + MemPort memPort; std::queue vertexQueue; // int vertexQueueSize; @@ -128,9 +84,6 @@ class PushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - AddrRangeList 
getAddrRanges(); - void recvFunctional(PacketPtr pkt); - bool handleUpdate(PacketPtr pkt); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -144,11 +97,14 @@ class PushEngine : public ClockedObject public: - PushEngine(const PushEngineParams ¶ms); + BasePushEngine(const PushEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); + }; } diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index b6e136dda5..923c1a2f38 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -28,7 +28,7 @@ from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files + from m5.objects.WLEngine import WLEngine from m5.objects.PushEngine import PushEngine from m5.objects.ApplyEngine import ApplyEngine diff --git a/src/accl/graph/base/PushEngine.py b/src/accl/graph/sega/PushEngine.py similarity index 83% rename from src/accl/graph/base/PushEngine.py rename to src/accl/graph/sega/PushEngine.py index 7fef165169..fa9d921a26 100644 --- a/src/accl/graph/base/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -29,9 +29,13 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class PushEngine(ClockedObject): - type = 'PushEngine' - cxx_header = "accl/push_engine.hh" - cxx_class = 'gem5::PushEngine' +from m5.objects.WLEngine import WLEngine +from m5.objects.PushEngine import PushEngine +from m5.objects.ApplyEngine import ApplyEngine - memPort = RequestPort("Port to communicate with the memory") +class MPU(ClockedObject): + type = 'MPU' + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = 'gem5::MPU' + + mpu = Param.MPU(Parent, "The MPU object than owns this PushEngine.") diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff 
--git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh new file mode 100644 index 0000000000..e69de29bb2 From 3f798dfd17a1ec8087fcdd6c904ae1e8777c91c1 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 11:34:28 -0800 Subject: [PATCH 038/247] Adding RequestorID --- src/accl/graph/base/base_apply_engine.cc | 13 +++++++++++++ src/accl/graph/base/base_apply_engine.hh | 3 +++ src/accl/graph/base/base_wl_engine.cc | 13 +++++++++++++ src/accl/graph/base/base_wl_engine.hh | 4 ++++ 4 files changed, 33 insertions(+) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index c88d14a2c2..111ea16f2e 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,6 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), @@ -54,6 +55,18 @@ BaseApplyEngine::getPort(const std::string &if_name, PortID idx) } } +RequestorID +BaseApplyEngine::getRequestorId() +{ + return requestorId; +} + +void +BaseApplyEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index c2d2f26387..3304e58a92 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -116,6 +116,9 @@ class BaseApplyEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 7261069c17..dec37636ba 100644 --- 
a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -37,6 +37,7 @@ namespace gem5 BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), @@ -54,6 +55,18 @@ BaseWLEngine::getPort(const std::string &if_name, PortID idx) } } +RequestorID +BaseWLEngine::getRequestorId() +{ + return requestorId; +} + +void +BaseWLEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 2095a20f1b..a63d9b1ef7 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -97,6 +97,7 @@ class BaseWLEngine : public ClockedObject void recvReqRetry() override; }; + RequestorID requestorId; MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; @@ -125,6 +126,9 @@ class BaseWLEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); }; } From d8680eeef1505fb937c7e1ddc8f37681669f46e5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 13:01:19 -0800 Subject: [PATCH 039/247] Definining MPU interfaces. 
--- src/accl/graph/base/base_push_engine.cc | 35 +---- src/accl/graph/base/base_push_engine.hh | 24 ---- src/accl/graph/base/base_wl_engine.hh | 1 + src/accl/graph/sega/mpu.cc | 183 ++++++++++++++++++++++++ src/accl/graph/sega/mpu.hh | 134 +++++++++++++++++ src/mem/packet.hh | 3 + 6 files changed, 322 insertions(+), 58 deletions(-) create mode 100644 src/accl/graph/sega/mpu.cc diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 9fbc89221f..c4388cab4b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -49,11 +49,7 @@ BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(par Port & BasePushEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } + return SimObject::getPort(if_name, idx); } RequestorID @@ -104,36 +100,7 @@ BasePushEngine::startup() } -bool -BasePushEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} -void -BasePushEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -void -BasePushEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} bool BasePushEngine::handleUpdate(PacketPtr pkt) diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 591f4ab734..2265bb32db 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -42,34 +42,10 @@ namespace gem5 class BasePushEngine : public ClockedObject { private: - - class MemPort : public RequestPort - { - private: - BasePushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, PushEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - virtual void startup() override; RequestorID requestorId; - MemPort memPort; - std::queue vertexQueue; // int vertexQueueSize; // int vertexQueueLen; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index a63d9b1ef7..3a683bb6e4 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -105,6 +105,7 @@ class BaseWLEngine : public ClockedObject std::unordered_map requestOffset; //Events + //FIXME: make handleWLUpdate public bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..c45ad78ef9 --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 The Regents of the University of 
California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +void +MPU::startup() +{ + if (((int16_t) applyEngine->getRequestorId) == -1) { + applyEngine->setRequestorId(nextRequestorId++); + } + if (((int16_t) pushEngine->getRequestorId) == -1) { + pushEngine->setRequestorId(nextRequestorId++); + } + if (((int16_t) wlEngine->getRequestorId) == -1) { + wlEngine->setRequestorId(nextRequestorId++); + } +} + +AddrRangeList +MPU::MPURespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +MPU::MPURespPort::recvTimingReq(PacketPtr pkt) +{ + return wlEngine->handleWLUpdate(pkt); +} + +Tick +MPU::MPURespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +MPU::MPURespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +MPU::MPURespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +MPU::MPUReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +MPU::MPUReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +MPU::MPUReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +MPU::MPUMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +void +MPU::MPUMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +AddrRangeList +MPU::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + +void +MPU::recvFunctional(PacketPtr pkt) +{ + if (pkt->isUpdateWL()) { + panic("Functional requests should not be made to WL.") + //TODO: Might be a good idea to implement later. + // wlEngine->recvFunctional(pkt); + } else { + memPort.recvFuctional(pkt); + } +} + +bool +MPU::handleMemReq(PacketPtr pkt) +{ + return memPort.recvTimingReq(pkt); +} + +void +MPU::handleMemResp(PacketPtr pkt) +{ + //TODO: Implement this; +} + +bool +MPU::recvWLNotif(WorkListItem wl) +{ + return applyEngine->recvWLUpdate(wl); +} + +bool +MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +{ + return pushEngine->recvApplyUpdate(prop, degree, edgeIndex); +} + +bool +MPU::recvPushUpdate(PacketPtr pkt) +{ + // TODO: Implement this Mahyar +} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index e69de29bb2..bc4ba5d53b 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include "accl/graph/base/util.hh" +#include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class MPU : public ClockedObject +{ + private: + class MPURespPort : public ResponsePort + { + private: + MPU* owner; + + public: + MPURespPort(const std::string& name, MPU* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class MPUReqPort : public RequestPort + { + private: + MPU* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MPUReqPort(const std::string& name, MPU* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + class MPUMemPort : public RequestPort + { + private: + MPU* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, MPU* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + virtual void startup(); + + RequestorID nextRequestorId; + + MPURespPort respPort; + MPUReqPort reqPort; + MPUMemPort memPort; + + ApplyEngine* applyEngine; + PushEngine* pushEngine; + WLEngine* wlEngine; + + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr 
pkt); + + bool handleMemReq(PacketPtr pkt); + void handleMemResp(PacketPtr pkt); + + bool recvWLNotif(WorkListItem wl); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + bool recvPushUpdate(PacketPtr pkt); + + public: + + MPU(const MPUParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +} + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 5332ee32a2..44c44d08a6 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -178,6 +178,7 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent + UpdateWL, // MPU Accelerator NUM_COMMAND_ATTRIBUTES }; @@ -267,6 +268,8 @@ class MemCmd cmd == ReadCleanReq || cmd == ReadSharedReq); } + bool isUpdateWL() const {return testCmdAttrib(updateWL);} + Command responseCommand() const { From 1b1bbac7eedbbf1dfc1f8a5d1495227c6a87e789 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 15:46:10 -0800 Subject: [PATCH 040/247] Adding changes to ApplyEngine and WLEngine --- src/accl/graph/base/base_apply_engine.hh | 28 ++++-------------------- src/accl/graph/base/base_wl_engine.hh | 26 +++++----------------- 2 files changed, 9 insertions(+), 45 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 3304e58a92..d603cb2713 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -73,31 +73,8 @@ class BaseApplyEngine : public ClockedObject {} }; - class MemPort : public RequestPort - { - private: - BaseApplyEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseApplyEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - 
bool blocked(){ return _blocked;} - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - const RequestorID requestorId; - MemPort memPort; - ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -106,11 +83,14 @@ class BaseApplyEngine : public ClockedObject bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - + //FIXME: make void bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); + protected: + virtual void sendMemReq(PacketPtr pkt) = 0; + public: BaseApplyEngine(const ApplyParams &apply); diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3a683bb6e4..0530c64c72 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -77,26 +77,6 @@ class BaseWLEngine : public ClockedObject sendPktRetry(false){} }; - class MemPort : public RequestPort - { - private: - WLEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, WLEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - RequestorID requestorId; MemPort memPort; WLQueue updateQueue; @@ -113,6 +93,7 @@ class BaseWLEngine : public ClockedObject If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ + //FIXME: make void bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); @@ -121,8 +102,11 @@ class BaseWLEngine : public ClockedObject read + write Write edgelist loc in buffer */ + protected: + virtual void sendMemReq(PacketPtr pkt) = 0; + virtual void sendApplyReq(WorkListItem wl) = 0; - public: + public: 
BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, From 64080f26149dd3295e452b1e842e2fef1ef8613c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 22:39:08 -0800 Subject: [PATCH 041/247] Finished restructured for ApplyE and WLE, pre-compiled --- src/accl/graph/base/BaseApplyEngine.py | 9 +-- src/accl/graph/base/SConscript | 8 +- src/accl/graph/base/base_apply_engine.cc | 94 +++++++++--------------- src/accl/graph/base/base_apply_engine.hh | 13 ++-- src/accl/graph/base/base_wl_engine.cc | 78 +++++--------------- src/accl/graph/base/base_wl_engine.hh | 17 ++--- src/accl/graph/sega/SConscript | 37 ++++++++++ src/accl/graph/sega/apply_engine.cc | 48 ++++++++++++ src/accl/graph/sega/apply_engine.hh | 54 ++++++++++++++ src/accl/graph/sega/wl_engine.cc | 50 +++++++++++++ src/accl/graph/sega/wl_engine.hh | 57 ++++++++++++++ 11 files changed, 321 insertions(+), 144 deletions(-) create mode 100644 src/accl/graph/sega/SConscript diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 80aa430139..23fdfbb08a 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -29,10 +29,9 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class Apply(ClockedObject): - type = 'Apply' - cxx_header = "accl/apply.hh" - cxx_class = 'gem5::Apply' +class BaseApplyEngine(ClockedObject): + type = 'BaseApplyEngine' + cxx_header = "accl/base_apply_engine.hh" + cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index a881fa1e6e..cc55100064 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,11 +27,11 @@ Import('*') -SimObject('Apply.py') +SimObject('BaseApplyEngine.py') SimObject('BasePushEngine.py') 
-SimObject('WLEngine.py') +SimObject('BaseWLEngine.py') -Source('apply.cc') +Source('base_apply_engine.cc') Source('base_push_engine.cc') -Source('wl_engine.cc') +Source('base_wl_engine.cc') Source('util.cc') diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 111ea16f2e..805a7649b7 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), @@ -48,11 +47,7 @@ BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): Port & BaseApplyEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { return SimObject::getPort(if_name, idx); - } } RequestorID @@ -67,29 +62,6 @@ BaseApplyEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseApplyEngine::ApplyMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -BaseApplyEngine::ApplyMemPort::recvReqRetry() -{ - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - bool BaseApplyEngine::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ @@ -106,19 +78,19 @@ bool BaseApplyEngine::handleWL(PacketPtr pkt){ void BaseApplyEngine::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - if (!memPort.blocked()){ - PacketPtr pkt = queue.front(); - if (queue.sendPktRetry && !queue.blocked()){ - // respPort.trySendRetry(); - queue.sendPktRetry = false; - } - // conver 
to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - memPort.sendPacket(memPkt); + // if (!memPort.blocked()){ + PacketPtr pkt = queue.front(); + // if (queue.sendPktRetry && !queue.blocked()){ + // // respPort.trySendRetry(); + // queue.sendPktRetry = false; + // } + // conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = req_offset; + if (parent.sendMemReq(memPkt)){ queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -157,26 +129,26 @@ BaseApplyEngine::processNextApplyEvent(){ uint32_t temp_prop = wl.temp_prop; if (temp_prop != prop){ - if (!memPort.blocked() && !reqPort.blocked()){ - //update prop with temp_prop - if(prop < temp_prop){ - wl.prop = prop; - }else{ - wl.prop = temp_prop; - } - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort.sendPacket(writePkt); - reqPort.sendPacket(writePkt); + // if (!memPort.blocked() && !reqPort.blocked()){ + //update prop with temp_prop + if(prop < temp_prop){ + wl.prop = prop; + }else{ + wl.prop = temp_prop; + } + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. 
+ PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (parent.sendMemReq(writePkt) && + parent.recvApplyNotif(WorkListItem.prop, + WorkListItem.degree, + WorkListItem.edgeIndex)){ queue.pop(); - if (queue.sendPktRetry && !queue.blocked()){ - // memPort.trySendRetry(); - queue.sendPktRetry = false; - } + // memPort.trySendRetry(); + // queue.sendPktRetry = false; } }else{ queue.applyQueue.pop(); diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index d603cb2713..27d906f060 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_APPLY_HH__ -#define __ACCL_APPLY_HH__ +#ifndef __ACCL_BASEAPPLY_HH__ +#define __ACCL_BASEAPPLY_HH__ #include #include @@ -83,13 +83,14 @@ class BaseApplyEngine : public ClockedObject bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - //FIXME: make void - bool handleMemResp(PacketPtr resp); + + void handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); protected: - virtual void sendMemReq(PacketPtr pkt) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: BaseApplyEngine(const ApplyParams &apply); @@ -103,4 +104,4 @@ class BaseApplyEngine : public ClockedObject } -#endif // __ACCL_APPLY_HH__ +#endif // __BASEACCL_APPLY_HH__ diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index dec37636ba..4af6f5e326 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), 
updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), @@ -48,11 +47,7 @@ BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): Port & BaseWLEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } + return SimObject::getPort(if_name, idx); } RequestorID @@ -67,31 +62,6 @@ BaseWLEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -BaseWLEngine::WLMemPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. - assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -bool -BaseWLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - bool BaseWLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; @@ -109,20 +79,16 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt){ void BaseWLEngine::processNextWLReadEvent(){ auto queue = updateQueue; - while (!queue.empty()){ //create a map instead of front - PacketPtr pkt = queue.front(); - /// conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = - std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - if (!memPort.blocked()){ - queue.pop(); - memPort.sendPacket(memPkt); - break; - } + PacketPtr pkt = queue.front(); + /// conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr request = + std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = 
req_offset; + if (parent.sendMemReq()){ + queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ schedule(nextWLReadEvent, nextCycle()); @@ -150,7 +116,6 @@ void BaseWLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; - auto applyPort = reqPort; PacketPtr update = updateQ.front(); uint8_t* value = update->getPtr(); PacketPtr pkt = queue.front(); @@ -164,17 +129,16 @@ BaseWLEngine::processNextWLReduceEvent(){ if(*value < temp_prop){ temp_prop = *value; } - if (!memPort.blocked() && !applyPort.blocked()){ - wl.temp_prop = temp_prop; - uint8_t* wlItem = workListToMemory(wl); - memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort.sendPacket(writePkt); - applyPort.sendPacket(writePkt); + // if (!memPort.blocked() && !applyPort.blocked()){ + wl.temp_prop = temp_prop; + uint8_t* wlItem = workListToMemory(wl); + memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (parent.sendMemReq(writePkt) && + parent.sendWLNotif(writePkt)) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); @@ -187,12 +151,10 @@ BaseWLEngine::processNextWLReduceEvent(){ else{ queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 0530c64c72..1d0f3e33c1 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_WLE_HH__ -#define __ACCL_WLE_HH__ +#ifndef __ACCL_BASEWLENGINE_HH__ +#define __ACCL_BASEWLENGINE_HH__ #include #include @@ -78,23 +78,19 @@ class BaseWLEngine : public ClockedObject }; RequestorID requestorId; - MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; std::unordered_map requestOffset; //Events - //FIXME: make handleWLUpdate public - bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ - //FIXME: make void - bool handleMemResp(PacketPtr resp); + void handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -103,8 +99,8 @@ class BaseWLEngine : public ClockedObject Write edgelist loc in buffer */ protected: - virtual void sendMemReq(PacketPtr pkt) = 0; - virtual void sendApplyReq(WorkListItem wl) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool sendWLNotif(WorkListItem wl) = 0; public: BaseWLEngine(const BaseWLEngineParams ¶ms); @@ -114,8 +110,9 @@ class BaseWLEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorId requestorId); + bool handleWLUpdate(PacketPtr pkt); }; } -#endif // __ACCL_WLE_HH__ +#endif // __ACCL_BASEWLENGINE_HH__ diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..79afe3b7d0 --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import('*') + +SimObject('ApplyEngine.py') +SimObject('MPU.py') +SimObject('WLEngine.py') + +Source('apply_engine.cc') +Source('mpu.cc') +Source('push_engine.cc') +Source('wl_engine.cc') diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index e69de29bb2..41a568bd27 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/apply_engine.hh" + +namespace gem5{ + +ApplyEngine:ApplyEngine(const BaseApplyEngine ¶ms): + BaseApplyEngine(params) +{} + +virtual bool +ApplyEngine::sendMemReq(PacketPtr pkt){ + return mpu->handleMemReq(pkt); +} + +virtual bool +ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ + mpu->recvApplyNotif(prop, degree, edgeIndex); + +} + +} \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index e69de29bb2..fd2bca008f 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_APPLY_HH__ +#define __ACCL_APPLY_HH__ + +#include +#include + +#include "accl/graph/base/base_apply_engine.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/ApplyEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" + +namespace gem5 +{ + +class ApplyEngine : public BaseApplyEngine +{ + private: + MPU mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); + virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + public: + ApplyEngine(const ApplyEngineParams ¶ms); +} diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e69de29bb2..9608d0cbc4 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" + +#include + +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams ¶ms): + BaseWLEngine(params) +{} + +virtual bool +WLEngine::sendMemReq(PacketPtr pkt){ + return mpu->handleMemReq(pkt); +} + +// FIXME: handle the case where Apply queue is full +virtual bool +WLEngine::sendWLNotif(WorkListItem wl){ + mpu->recvWLNotif(wl); + return true; +} \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index e69de29bb2..eee6b1f22f 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_WLENGINE_HH__ +#define __ACCL_WLENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/WLEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" +#include "sim/system.hh" + + +namespace gem5 +{ + +class WLEngine : public BaseWorkListEngine +{ + private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); + virtual bool sendWLNotif(WorkListItem wl); + public: + WLEngine(const WLEngineParams ¶ms); +} \ No newline at end of file From c6ce909250341eed9d6fe814c45eb402dad0d3b7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 23:31:49 -0800 Subject: [PATCH 042/247] Finished restructure for PushEngine. Pre-compile. 
--- src/accl/graph/base/base_push_engine.cc | 30 +++++-------- src/accl/graph/base/base_push_engine.hh | 19 +++++++- src/accl/graph/sega/mpu.cc | 29 +++++++++++-- src/accl/graph/sega/mpu.hh | 2 +- src/accl/graph/sega/push_engine.cc | 58 +++++++++++++++++++++++++ src/accl/graph/sega/push_engine.hh | 55 +++++++++++++++++++++++ 6 files changed, 169 insertions(+), 24 deletions(-) diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index c4388cab4b..6871154276 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -33,7 +33,8 @@ namespace gem5 { -BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(params), +BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : + ClockedObject(params), requestorId(0), memPort(name() + ".memPort", this), // vertexQueueSize(params.vertex_queue_size), @@ -103,7 +104,8 @@ BasePushEngine::startup() bool -BasePushEngine::handleUpdate(PacketPtr pkt) +BasePushEngine::recvApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edge_index) { //FIXME: There should be a check if the queues are full. 
// if (vertexQueueLen < vertexQueueSize) { @@ -115,7 +117,7 @@ BasePushEngine::handleUpdate(PacketPtr pkt) // return true; // } // return false; - vertexQueue.push(pkt); + notifQueue.emplace(prop, degree, edge_index); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); } @@ -125,21 +127,15 @@ BasePushEngine::handleUpdate(PacketPtr pkt) void BasePushEngine::processNextReceiveEvent() { - PacketPtr updatePkt = vertexQueue.front(); - uint8_t* data = updatePkt->getPtr(); - - // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) - uint32_t edge_index = *((uint32_t *)data); - uint32_t degree = *((uint32_t *)(data + 4)); - uint32_t value = *((uint32_t *)(data + 8)); + ApplyNotif notif = notifQueue.front(); std::vector addr_queue; std::vector offset_queue; std::vector num_edge_queue; - for (uint32_t index = 0; index < degree; index++) { + for (uint32_t index = 0; index < notif.degree; index++) { // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 + (edge_index + index) * sizeof(Edge); + Addr edge_addr = 1048576 + (notif.edge_index + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { @@ -164,10 +160,10 @@ BasePushEngine::processNextReceiveEvent() memReqQueue.push(pkt); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = value; + reqValueMap[pkt->req] = notif.prop; } - vertexQueue.pop(); + notifQueue.pop(); if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { schedule(nextReadEvent, nextCycle()); @@ -178,8 +174,7 @@ void BasePushEngine::processNextReadEvent() { PacketPtr pkt = memReqQueue.front(); - if (!memPort.blocked()) { - memPort.sendPacket(pkt); + if (!sendMemReq(pkt)) { memReqQueue.pop(); } @@ -226,8 +221,7 @@ void BasePushEngine::processNextSendEvent() { PacketPtr pkt = updateQueue.front(); - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); + if 
(!sendPushUpdate(pkt)) { updateQueue.pop(); } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 2265bb32db..63ad3a6652 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -42,11 +42,22 @@ namespace gem5 class BasePushEngine : public ClockedObject { private: + + struct ApplyNotif { + uint32_t prop; + uint32_t degree; + uint32_t edgeIndex; + + ApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index): + prop(prop), degree(degree), edgeIndex(edge_index) + {} + }; + virtual void startup() override; RequestorID requestorId; - std::queue vertexQueue; + std::queue notifQueue; // int vertexQueueSize; // int vertexQueueLen; @@ -60,7 +71,7 @@ class BasePushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - bool handleUpdate(PacketPtr pkt); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -71,6 +82,10 @@ class BasePushEngine : public ClockedObject EventFunctionWrapper nextSendEvent; void processNextSendEvent(); + protected: + virtual bool sendMemRequest(PacketPtr pkt) = 0; + virtual bool sendPushUpdate(PacketPtr pkt) = 0; + public: BasePushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index c45ad78ef9..09ab23a835 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -161,7 +161,16 @@ MPU::handleMemReq(PacketPtr pkt) void MPU::handleMemResp(PacketPtr pkt) { - //TODO: Implement this; + RequestorID requestorId = pkt->requestorId(); + if (applyEngine->getRequestorId() == requestorId) { + applyEngine->handleMemResp(pkt); + } else if (pushEngine->getRequestorId() == requestorId) { + pushEngine->handleMemResp(pkt); + } else if (wlEngine->getRequestorId() == requestorId) { + wlEngine->handleMemResp(pkt); + } else { + panic("Received a response with an unknown 
requestorId."); + } } bool @@ -173,11 +182,25 @@ MPU::recvWLNotif(WorkListItem wl) bool MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) { - return pushEngine->recvApplyUpdate(prop, degree, edgeIndex); + return pushEngine->recvApplyUpdate(prop, degree, edge_index); } bool MPU::recvPushUpdate(PacketPtr pkt) { - // TODO: Implement this Mahyar + Addr addr = pkt->getAddr(); + for (auto addr_range: memPort.getAddrRangeList()) { + if (addr_range.contains(addr)) { + if (!memPort.sendPacket(pkt)) { + return false; + } + return true; + } + } + + if (!reqPort.sendPacket(pkt)) { + return false; + } + return true; + } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index bc4ba5d53b..93d1dd8bb3 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -120,7 +120,7 @@ class MPU : public ClockedObject void handleMemResp(PacketPtr pkt); bool recvWLNotif(WorkListItem wl); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); bool recvPushUpdate(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e69de29bb2..e43512c6f4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const PushEngine ¶ms) : + BasePushEngine(params), + owner(params.mpu) +{ +} + +Port & +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + return SimObject::getPort(if_name, idx); +} + +bool +PushEngine::sendMemReq(PacketPtr) +{ + return owner->handleMemReq(pkt); +} + +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + return owner->recvPushUpdate(pkt); +} + +} diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e69de29bb2..54ef72d5f9 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include "accl/graph/base/base_push_engine.hh" + +namespace gem5 +{ +class PushEngine : public BasePushEngine +{ + private: + MPU* owner; + + protected: + virtual bool sendMemRequest(PacketPtr pkt); + virtual bool sendPushUpdate(PacketPtr pkt); + + public: + PushEngine(const PushEngineParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + +} + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ \ No newline at end of file From 8a2dae86375bd48db32f494343df2fc9d5d35816 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 23:51:02 -0800 Subject: [PATCH 043/247] Debugging. 
--- src/accl/graph/base/base_apply_engine.cc | 31 +++++++++--------------- src/accl/graph/base/base_apply_engine.hh | 13 +++++----- src/accl/graph/base/base_push_engine.hh | 9 ++++--- src/accl/graph/base/base_wl_engine.cc | 6 ++--- src/accl/graph/base/base_wl_engine.hh | 9 ++++--- src/accl/graph/base/util.cc | 2 +- src/accl/graph/sega/ApplyEngine.py | 7 ++---- src/accl/graph/sega/MPU.py | 6 ++--- src/accl/graph/sega/PushEngine.py | 16 +++++------- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/WLEngine.py | 7 ++---- src/accl/graph/sega/apply_engine.cc | 6 ++--- src/accl/graph/sega/apply_engine.hh | 10 +++++--- src/accl/graph/sega/mpu.cc | 11 ++++++--- src/accl/graph/sega/mpu.hh | 5 ++++ src/accl/graph/sega/wl_engine.hh | 9 ++++--- 16 files changed, 75 insertions(+), 73 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 805a7649b7..301f5931bf 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/base_apply_engine.hh" +#include "accl/graph/base/base_apply_engine.hh" #include @@ -90,7 +90,7 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (parent.sendMemReq(memPkt)){ + if (sendMemReq(memPkt)){ queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -98,22 +98,13 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ } } -bool +void BaseApplyEngine::handleMemResp(PacketPtr pkt) { - auto queue = applyWriteQueue; - - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else - queue.push(pkt); - - if(!nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); - } - return true; - return true; + // FIXME: change the event, remove the retry parts + if(!nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); + } } void @@ -142,10 +133,10 @@ BaseApplyEngine::processNextApplyEvent(){ //Create memory write requests. PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (parent.sendMemReq(writePkt) && - parent.recvApplyNotif(WorkListItem.prop, - WorkListItem.degree, - WorkListItem.edgeIndex)){ + if (sendMemReq(writePkt) && + recvApplyNotif(wl.prop, + wl.degree, + wl.edgeIndex)){ queue.pop(); // memPort.trySendRetry(); // queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 27d906f060..56b43cfb7b 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -26,14 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_BASEAPPLY_HH__ -#define __ACCL_BASEAPPLY_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ #include #include #include "mem/packet.hh" #include "mem/port.hh" +#include "mem/request.hh" #include "params/BaseApplyEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" @@ -73,7 +74,7 @@ class BaseApplyEngine : public ClockedObject {} }; - const RequestorID requestorId; + RequestorID requestorId; ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -93,15 +94,15 @@ class BaseApplyEngine : public ClockedObject virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: - BaseApplyEngine(const ApplyParams &apply); + BaseApplyEngine(const BaseApplyEngineParams &apply); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); }; } -#endif // __BASEACCL_APPLY_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 63ad3a6652..873cb26b3d 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_PUSH_ENGINE_HH__ -#define __ACCL_PUSH_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ #include #include "mem/port.hh" +#include "mem/request.hh" #include "mem/packet.hh" #include "params/BasePushEngine.hh" #include "sim/clocked_object.hh" @@ -94,10 +95,10 @@ class BasePushEngine : public ClockedObject PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); }; } -#endif // __ACCL_PUSH_ENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 4af6f5e326..b863b38e19 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -87,7 +87,7 @@ void BaseWLEngine::processNextWLReadEvent(){ std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (parent.sendMemReq()){ + if (sendMemReq()){ queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ @@ -135,8 +135,8 @@ BaseWLEngine::processNextWLReduceEvent(){ memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (parent.sendMemReq(writePkt) && - parent.sendWLNotif(writePkt)) { + if (sendMemReq(writePkt) && + sendWLNotif(writePkt)) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 1d0f3e33c1..3d807d8b06 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_BASEWLENGINE_HH__ -#define __ACCL_BASEWLENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ #include #include +#include "accl/graph/base/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" @@ -109,10 +110,10 @@ class BaseWLEngine : public ClockedObject PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); bool handleWLUpdate(PacketPtr pkt); }; } -#endif // __ACCL_BASEWLENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc index 40a1fc761b..0baa374714 100644 --- a/src/accl/graph/base/util.cc +++ b/src/accl/graph/base/util.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index 0d03e71e54..bb43836ff7 100644 --- a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -25,16 +25,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files from m5.objects.BaseApplyEngine import BaseApplyEngine class ApplyEngine(BaseApplyEngine): type = 'ApplyEngine' cxx_header = "accl/graph/sega/apply_engine.hh" - cxx_class = 'gem5::MPU' + cxx_class = 'gem5::ApplyEngine' - mpu = Param.MPU(Parent, "MPU object that owns this ApplyEngine") + mpu = Param.MPU(Parent.any, "MPU object that owns this ApplyEngine") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 923c1a2f38..046dfaf4e8 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -29,9 +29,9 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -from m5.objects.WLEngine import WLEngine -from m5.objects.PushEngine import PushEngine -from m5.objects.ApplyEngine import ApplyEngine +# from m5.objects.WLEngine import WLEngine +# from m5.objects.PushEngine import PushEngine +# from m5.objects.ApplyEngine import ApplyEngine class MPU(ClockedObject): type = 'MPU' diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index fa9d921a26..eb0eed18ab 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,15 +27,11 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BasePushEngine import BasePushEngine -from m5.objects.WLEngine import WLEngine -from m5.objects.PushEngine import PushEngine -from m5.objects.ApplyEngine import ApplyEngine +class PushEngine(BasePushEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' -class MPU(ClockedObject): - type = 'MPU' - cxx_header = "accl/graph/sega/mpu.hh" - cxx_class = 'gem5::MPU' - - mpu = Param.MPU(Parent, "The MPU object than owns this PushEngine.") + mpu = Param.MPU(Parent.any, "MPU object that 
owns this PushEngine") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 79afe3b7d0..dc19ece06b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -29,6 +29,7 @@ Import('*') SimObject('ApplyEngine.py') SimObject('MPU.py') +SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index a8f3bd20ea..12fbcf9b4f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -25,16 +25,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files from m5.objects.BaseWLEngine import BaseWLEngine class WLEngine(BaseWLEngine): type = 'WLEngine' cxx_header = "accl/graph/sega/wl_engine.hh" - cxx_class = 'gem5::MPU' + cxx_class = 'gem5::WLEngine' - mpu = Param.MPU(Parent, "MPU object that owns this WLEngine") \ No newline at end of file + mpu = Param.MPU(Parent.any, "MPU object that owns this WLEngine") \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 41a568bd27..64ae71e290 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -30,16 +30,16 @@ namespace gem5{ -ApplyEngine:ApplyEngine(const BaseApplyEngine ¶ms): +ApplyEngine::ApplyEngine(const BaseApplyEngine ¶ms): BaseApplyEngine(params) {} -virtual bool +bool ApplyEngine::sendMemReq(PacketPtr pkt){ return mpu->handleMemReq(pkt); } -virtual bool +bool ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ mpu->recvApplyNotif(prop, degree, edgeIndex); diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh 
index fd2bca008f..855ebbd8b0 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_APPLY_HH__ -#define __ACCL_APPLY_HH__ +#ifndef __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ #include #include @@ -45,10 +45,14 @@ namespace gem5 class ApplyEngine : public BaseApplyEngine { private: - MPU mpu; + MPU* mpu; protected: virtual bool sendMemReq(PacketPtr pkt); virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: ApplyEngine(const ApplyEngineParams ¶ms); +}; + } + +#endif // __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 09ab23a835..27f7c8e314 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -28,16 +28,19 @@ #include "accl/graph/sega/mpu.hh" +namespace gem5 +{ + void MPU::startup() { - if (((int16_t) applyEngine->getRequestorId) == -1) { + if (((int16_t) applyEngine->getRequestorId()) == -1) { applyEngine->setRequestorId(nextRequestorId++); } - if (((int16_t) pushEngine->getRequestorId) == -1) { + if (((int16_t) pushEngine->getRequestorId()) == -1) { pushEngine->setRequestorId(nextRequestorId++); } - if (((int16_t) wlEngine->getRequestorId) == -1) { + if (((int16_t) wlEngine->getRequestorId()) == -1) { wlEngine->setRequestorId(nextRequestorId++); } } @@ -204,3 +207,5 @@ MPU::recvPushUpdate(PacketPtr pkt) return true; } + +} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 93d1dd8bb3..b37821c200 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -39,6 +39,9 @@ #include "params/MPU.hh" #include "sim/clocked_object.hh" +namespace gem5 +{ + class MPU : public ClockedObject { private: @@ -129,6 +132,8 @@ class MPU : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; +}; + } 
#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index eee6b1f22f..938128e05f 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_WLENGINE_HH__ -#define __ACCL_WLENGINE_HH__ +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ #include #include @@ -54,4 +54,7 @@ class WLEngine : public BaseWorkListEngine virtual bool sendWLNotif(WorkListItem wl); public: WLEngine(const WLEngineParams ¶ms); -} \ No newline at end of file +}; + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ \ No newline at end of file From c57c564598e55741ed4c33194e7e0c2750efe9c1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 14:40:15 -0800 Subject: [PATCH 044/247] Lots of debugging. --- src/accl/graph/base/BaseApplyEngine.py | 2 +- src/accl/graph/base/BasePushEngine.py | 1 - src/accl/graph/base/BaseWLEngine.py | 4 +- src/accl/graph/base/base_apply_engine.cc | 105 ++++++++++------------ src/accl/graph/base/base_apply_engine.hh | 40 ++------- src/accl/graph/base/base_push_engine.cc | 45 +--------- src/accl/graph/base/base_push_engine.hh | 10 +-- src/accl/graph/base/base_wl_engine.cc | 6 +- src/accl/graph/base/base_wl_engine.hh | 6 +- src/accl/graph/sega/MPU.py | 6 +- src/accl/graph/sega/apply_engine.cc | 10 ++- src/accl/graph/sega/apply_engine.hh | 9 +- src/accl/graph/sega/mpu.cc | 107 +++++++++++++++++++---- src/accl/graph/sega/mpu.hh | 20 ++--- src/accl/graph/sega/push_engine.cc | 11 +-- src/accl/graph/sega/push_engine.hh | 12 ++- src/accl/graph/sega/wl_engine.cc | 19 ++-- src/accl/graph/sega/wl_engine.hh | 13 ++- src/mem/packet.hh | 3 - 19 files changed, 217 insertions(+), 212 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 23fdfbb08a..45d94b3fd2 
100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -31,7 +31,7 @@ class BaseApplyEngine(ClockedObject): type = 'BaseApplyEngine' - cxx_header = "accl/base_apply_engine.hh" + cxx_header = "accl/graph/base/base_apply_engine.hh" cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 6ed5d25978..891221c06d 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -34,4 +34,3 @@ class BasePushEngine(ClockedObject): cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 7384e876ef..3ecf030138 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -31,8 +31,8 @@ class BaseWLEngine(ClockedObject): type = 'BaseWLEngine' - cxx_header = "accl/base_wl_engine.hh" + cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") + diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 301f5931bf..731cd5c345 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -38,8 +38,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), requestorId(-1), - applyReadQueue(params.applyQueueSize), - applyWriteQueue(params.applyQueueSize), + queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} @@ -62,14 +61,14 @@ BaseApplyEngine::setRequestorId(RequestorID 
requestorId) this->requestorId = requestorId; } -bool BaseApplyEngine::handleWL(PacketPtr pkt){ - auto queue = applyReadQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else{ - queue.push(pkt); - } +bool BaseApplyEngine::recvWLNotif(Addr addr){ + // TODO: Investigate the situation where the queue is full. + // if (applyReadQueue.size() == queueSize){ + // // applyReadQueue.sendPktRetry = true; + // return true; + // } else{ + applyReadQueue.push(addr); + // } if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } @@ -77,78 +76,64 @@ bool BaseApplyEngine::handleWL(PacketPtr pkt){ } void BaseApplyEngine::processNextApplyCheckEvent(){ - auto queue = applyReadQueue; - // if (!memPort.blocked()){ - PacketPtr pkt = queue.front(); - // if (queue.sendPktRetry && !queue.blocked()){ - // // respPort.trySendRetry(); - // queue.sendPktRetry = false; - // } - // conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; + Addr addr = applyReadQueue.front(); + Addr req_addr = (addr / 64) * 64; + int req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; if (sendMemReq(memPkt)){ - queue.pop(); + applyReadQueue.pop(); } - if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ + if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } } -void +bool BaseApplyEngine::handleMemResp(PacketPtr pkt) { // FIXME: change the event, remove the retry parts + applyWriteQueue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } + return true; } void BaseApplyEngine::processNextApplyEvent(){ - auto queue = applyWriteQueue; - PacketPtr pkt = queue.front(); - uint8_t* data = pkt->getPtr(); + PacketPtr pkt = applyWriteQueue.front(); + uint8_t* data = pkt->getPtr(); - RequestPtr 
request = pkt->req; - int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t prop = wl.prop; - uint32_t temp_prop = wl.temp_prop; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; + WorkListItem wl = memoryToWorkList(data + request_offset); + uint32_t prop = wl.prop; + uint32_t temp_prop = wl.temp_prop; - if (temp_prop != prop){ - // if (!memPort.blocked() && !reqPort.blocked()){ - //update prop with temp_prop - if(prop < temp_prop){ - wl.prop = prop; - }else{ - wl.prop = temp_prop; - } - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - recvApplyNotif(wl.prop, - wl.degree, - wl.edgeIndex)){ - queue.pop(); - // memPort.trySendRetry(); - // queue.sendPktRetry = false; - } - }else{ - queue.applyQueue.pop(); - if (queue.sendPktRetry && !queue.blocked()){ - // memPort.trySendRetry(); - queue.sendPktRetry = false; - } + if (temp_prop != prop) { + // TODO: instead of min add a Reduce function. + //update prop with temp_prop + if(prop < temp_prop) { + wl.prop = prop; + }else { + wl.prop = temp_prop; + } + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. 
+ PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (sendMemReq(writePkt) && + sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { + applyWriteQueue.pop(); } - if(!queue.empty() && !nextApplyEvent.scheduled()){ + }else { + applyWriteQueue.pop(); + } + if(!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 56b43cfb7b..b7c0db90cb 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -45,53 +45,24 @@ namespace gem5 class BaseApplyEngine : public ClockedObject { private: - //FIXME: Remove queue defenition from here. - struct ApplyQueue{ - std::queue applyQueue; - const uint32_t queueSize; - bool sendPktRetry; - - bool blocked(){ - return (applyQueue.size() == queueSize); - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - void pop(){ - applyQueue.pop(); - } - - PacketPtr front(){ - return applyQueue.front(); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize) - {} - }; RequestorID requestorId; - ApplyQueue applyReadQueue; - ApplyQueue applyWriteQueue; + std::queue applyReadQueue; + std::queue applyWriteQueue; + int queueSize; std::unordered_map requestOffset; - bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - void handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); protected: virtual bool sendMemReq(PacketPtr pkt) = 0; - virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; + virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: BaseApplyEngine(const BaseApplyEngineParams &apply); @@ -101,6 +72,9 @@ class BaseApplyEngine : public ClockedObject RequestorID getRequestorId(); void 
setRequestorId(RequestorID requestorId); + + bool recvWLNotif(Addr addr); + bool handleMemResp(PacketPtr resp); }; } diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 6871154276..d93cbdf8da 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -33,10 +33,9 @@ namespace gem5 { -BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : +BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), // updateQueue(params.update_queue_size), @@ -65,44 +64,6 @@ BasePushEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BasePushEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 - }; - Edge edges [6] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, requestorId); - memPort.sendFunctional(pkt); - } - - for (int i = 0; i < 6; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, requestorId); - memPort.sendFunctional(pkt); - } - -} - - - bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) @@ -135,7 +96,7 @@ BasePushEngine::processNextReceiveEvent() for (uint32_t index = 0; index < notif.degree; index++) { // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 
+ (notif.edge_index + index) * sizeof(Edge); + Addr edge_addr = 1048576 + (notif.edgeIndex + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 873cb26b3d..c723932975 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -54,8 +54,6 @@ class BasePushEngine : public ClockedObject {} }; - virtual void startup() override; - RequestorID requestorId; std::queue notifQueue; @@ -72,24 +70,22 @@ class BasePushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - bool handleMemResp(PacketPtr pkt); EventFunctionWrapper nextSendEvent; void processNextSendEvent(); protected: - virtual bool sendMemRequest(PacketPtr pkt) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendPushUpdate(PacketPtr pkt) = 0; public: - BasePushEngine(const PushEngineParams ¶ms); + BasePushEngine(const BasePushEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -97,6 +93,8 @@ class BasePushEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorID requestorId); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); + bool handleMemResp(PacketPtr pkt); }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index b863b38e19..806ab4a6c3 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -30,8 +30,6 @@ #include -#include "accl/graph/base/util.hh" - namespace gem5 { @@ -87,7 +85,7 @@ void BaseWLEngine::processNextWLReadEvent(){ std::make_shared(req_addr, 64, 
0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (sendMemReq()){ + if (sendMemReq(memPkt)){ queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ @@ -136,7 +134,7 @@ BaseWLEngine::processNextWLReduceEvent(){ PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); if (sendMemReq(writePkt) && - sendWLNotif(writePkt)) { + sendWLNotif(writePkt->getAddr())) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3d807d8b06..a2cab4c7e2 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -91,7 +91,7 @@ class BaseWLEngine : public ClockedObject If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ - void handleMemResp(PacketPtr resp); + EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -101,7 +101,7 @@ class BaseWLEngine : public ClockedObject */ protected: virtual bool sendMemReq(PacketPtr pkt) = 0; - virtual bool sendWLNotif(WorkListItem wl) = 0; + virtual bool sendWLNotif(Addr addr) = 0; public: BaseWLEngine(const BaseWLEngineParams ¶ms); @@ -111,7 +111,9 @@ class BaseWLEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorID requestorId); + bool handleWLUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr resp); }; } diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 046dfaf4e8..68cfb3d42d 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,9 +38,9 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - workListEngine = Param.WLEngine("WLEngine object to connect to " + work_list_engine = Param.WLEngine("WLEngine object to connect to " "This MPU") - 
applyEngine = Param.ApplyEngine("ApplyEngine object to connect to " + apply_engine = Param.ApplyEngine("ApplyEngine object to connect to " "This MPU") - pushEngine = Param.PushEngine("PushEngine object to connect to " + push_engine = Param.PushEngine("PushEngine object to connect to " "This MPU") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 64ae71e290..bc45850041 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,11 +27,13 @@ */ #include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/mpu.hh" namespace gem5{ -ApplyEngine::ApplyEngine(const BaseApplyEngine ¶ms): - BaseApplyEngine(params) +ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : + BaseApplyEngine(params), + mpu(params.mpu) {} bool @@ -40,9 +42,9 @@ ApplyEngine::sendMemReq(PacketPtr pkt){ } bool -ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ +ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ mpu->recvApplyNotif(prop, degree, edgeIndex); - + return true; } } \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 855ebbd8b0..17e3280cb5 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -42,14 +42,21 @@ namespace gem5 { +class MPU; + class ApplyEngine : public BaseApplyEngine { private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); - virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + public: + ApplyEngine(const ApplyEngineParams ¶ms); }; diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 27f7c8e314..4824bcd699 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -31,6 +31,31 @@ namespace gem5 { +MPU::MPU(const MPUParams ¶ms): + 
ClockedObject(params), + nextRequestorId(0), + respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), + memPort(name() + ".memPort", this), + applyEngine(params.apply_engine), + pushEngine(params.push_engine), + wlEngine(params.work_list_engine) +{} + +Port& +MPU::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "respPort") { + return respPort; + } else if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + void MPU::startup() { @@ -43,6 +68,37 @@ MPU::startup() if (((int16_t) wlEngine->getRequestorId()) == -1) { wlEngine->setRequestorId(nextRequestorId++); } + + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + memPort.sendFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + memPort.sendFunctional(pkt); + } } AddrRangeList @@ -54,7 +110,7 @@ MPU::MPURespPort::getAddrRanges() const bool MPU::MPURespPort::recvTimingReq(PacketPtr pkt) { - return wlEngine->handleWLUpdate(pkt); + return owner->handleWLUpdate(pkt); } Tick @@ -106,12 +162,6 @@ MPU::MPUReqPort::recvReqRetry() } } -bool -MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - void MPU::MPUMemPort::sendPacket(PacketPtr pkt) { @@ -124,6 +174,14 @@ 
MPU::MPUMemPort::sendPacket(PacketPtr pkt) } } +bool +MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + owner->handleMemResp(pkt); + return true; +} + void MPU::MPUMemPort::recvReqRetry() { @@ -146,19 +204,21 @@ MPU::getAddrRanges() void MPU::recvFunctional(PacketPtr pkt) { - if (pkt->isUpdateWL()) { - panic("Functional requests should not be made to WL.") + if (pkt->cmd == MemCmd::UpdateWL) { + panic("Functional requests should not be made to WL."); //TODO: Might be a good idea to implement later. // wlEngine->recvFunctional(pkt); } else { - memPort.recvFuctional(pkt); + memPort.sendFunctional(pkt); } } bool MPU::handleMemReq(PacketPtr pkt) { - return memPort.recvTimingReq(pkt); + //TODO: Investigate sending true all the time + memPort.sendPacket(pkt); + return true; } void @@ -177,33 +237,42 @@ MPU::handleMemResp(PacketPtr pkt) } bool -MPU::recvWLNotif(WorkListItem wl) +MPU::handleWLUpdate(PacketPtr pkt) +{ + return wlEngine->handleWLUpdate(pkt); +} + +bool +MPU::recvWLNotif(Addr addr) { - return applyEngine->recvWLUpdate(wl); + return applyEngine->recvWLNotif(addr); } bool -MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { - return pushEngine->recvApplyUpdate(prop, degree, edge_index); + return pushEngine->recvApplyNotif(prop, degree, edge_index); } bool MPU::recvPushUpdate(PacketPtr pkt) { Addr addr = pkt->getAddr(); - for (auto addr_range: memPort.getAddrRangeList()) { + for (auto addr_range: memPort.getAddrRanges()) { if (addr_range.contains(addr)) { - if (!memPort.sendPacket(pkt)) { + if (memPort.blocked()) { return false; + } else { + memPort.sendPacket(pkt); + return true; } - return true; } } - if (!reqPort.sendPacket(pkt)) { + if (reqPort.blocked()) { return false; } + reqPort.sendPacket(pkt); return true; } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index b37821c200..be5139c0e0 
100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,7 +29,6 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ -#include "accl/graph/base/util.hh" #include "accl/graph/sega/apply_engine.hh" #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/wl_engine.hh" @@ -91,7 +90,7 @@ class MPU : public ClockedObject PacketPtr blockedPacket; public: - MemPort(const std::string& name, MPU* owner): + MPUMemPort(const std::string& name, MPU* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -119,21 +118,22 @@ class MPU : public ClockedObject AddrRangeList getAddrRanges(); void recvFunctional(PacketPtr pkt); - bool handleMemReq(PacketPtr pkt); - void handleMemResp(PacketPtr pkt); - - bool recvWLNotif(WorkListItem wl); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - bool recvPushUpdate(PacketPtr pkt); - public: MPU(const MPUParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool handleMemReq(PacketPtr pkt); + void handleMemResp(PacketPtr pkt); + + bool handleWLUpdate(PacketPtr pkt); + bool recvWLNotif(Addr addr); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); + bool recvPushUpdate(PacketPtr pkt); }; } -#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e43512c6f4..922ae32ed2 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -27,13 +27,14 @@ */ #include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/mpu.hh" namespace gem5 { -PushEngine::PushEngine(const PushEngine ¶ms) : +PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), - owner(params.mpu) + mpu(params.mpu) { } @@ -44,15 +45,15 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } bool 
-PushEngine::sendMemReq(PacketPtr) +PushEngine::sendMemReq(PacketPtr pkt) { - return owner->handleMemReq(pkt); + return mpu->handleMemReq(pkt); } bool PushEngine::sendPushUpdate(PacketPtr pkt) { - return owner->recvPushUpdate(pkt); + return mpu->recvPushUpdate(pkt); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 54ef72d5f9..e4bb83d2bc 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,16 +30,20 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" +#include "params/PushEngine.hh" namespace gem5 { + +class MPU; + class PushEngine : public BasePushEngine { private: - MPU* owner; + MPU* mpu; protected: - virtual bool sendMemRequest(PacketPtr pkt); + virtual bool sendMemReq(PacketPtr pkt); virtual bool sendPushUpdate(PacketPtr pkt); public: @@ -48,8 +52,8 @@ class PushEngine : public BasePushEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; -} +}; } -#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9608d0cbc4..40ec755969 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,24 +27,25 @@ */ #include "accl/graph/sega/wl_engine.hh" - -#include - +#include "accl/graph/sega/mpu.hh" namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params) + BaseWLEngine(params), + mpu(params.mpu) {} -virtual bool +bool WLEngine::sendMemReq(PacketPtr pkt){ return mpu->handleMemReq(pkt); } // FIXME: handle the case where Apply queue is full -virtual bool -WLEngine::sendWLNotif(WorkListItem wl){ - mpu->recvWLNotif(wl); +bool +WLEngine::sendWLNotif(Addr addr){ + mpu->recvWLNotif(addr); return true; -} \ No newline at end of file +} + +} diff --git a/src/accl/graph/sega/wl_engine.hh 
b/src/accl/graph/sega/wl_engine.hh index 938128e05f..c5f49ff6a2 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,16 +45,23 @@ namespace gem5 { -class WLEngine : public BaseWorkListEngine +// class MPU; + +class WLEngine : public BaseWLEngine { private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); - virtual bool sendWLNotif(WorkListItem wl); + virtual bool sendWLNotif(Addr addr); + public: + WLEngine(const WLEngineParams ¶ms); }; } -#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 44c44d08a6..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -178,7 +178,6 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent - UpdateWL, // MPU Accelerator NUM_COMMAND_ATTRIBUTES }; @@ -268,8 +267,6 @@ class MemCmd cmd == ReadCleanReq || cmd == ReadSharedReq); } - bool isUpdateWL() const {return testCmdAttrib(updateWL);} - Command responseCommand() const { From 8967f89ddfe20c155706993789344c5eff701d3c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 14:59:31 -0800 Subject: [PATCH 045/247] Style fix. 
--- src/accl/graph/base/BaseApplyEngine.py | 2 +- src/accl/graph/base/BasePushEngine.py | 1 - src/accl/graph/base/BaseWLEngine.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 45d94b3fd2..e48b425b01 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -31,7 +31,7 @@ class BaseApplyEngine(ClockedObject): type = 'BaseApplyEngine' - cxx_header = "accl/graph/base/base_apply_engine.hh" + cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 891221c06d..793b0a7c92 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -33,4 +33,3 @@ class BasePushEngine(ClockedObject): type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 3ecf030138..473fd05313 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -35,4 +35,3 @@ class BaseWLEngine(ClockedObject): cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") - From fa48d321dd41debc82f39646adf23ad780ca05a7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 15:20:51 -0800 Subject: [PATCH 046/247] Adding PARAMS macro. 
--- src/accl/graph/base/SConscript | 6 +++--- src/accl/graph/base/base_apply_engine.hh | 2 ++ src/accl/graph/base/base_push_engine.hh | 2 ++ src/accl/graph/base/base_wl_engine.hh | 3 +++ src/accl/graph/sega/SConscript | 8 ++++---- src/accl/graph/sega/apply_engine.hh | 2 +- src/accl/graph/sega/mpu.hh | 2 +- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.hh | 2 +- 9 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index cc55100064..5e82a44971 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,9 @@ Import('*') -SimObject('BaseApplyEngine.py') -SimObject('BasePushEngine.py') -SimObject('BaseWLEngine.py') +SimObject('BaseApplyEngine.py', sim_objects=["BaseApplyEngine"]) +SimObject('BasePushEngine.py', sim_objects=["BasePushEngine"]) +SimObject('BaseWLEngine.py', sim_objects=["BaseWLEngine"]) Source('base_apply_engine.cc') Source('base_push_engine.cc') diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index b7c0db90cb..fbcf95c238 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -65,6 +65,8 @@ class BaseApplyEngine : public ClockedObject virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: + PARAMS(BaseApplyEngine); + BaseApplyEngine(const BaseApplyEngineParams &apply); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index c723932975..446f6a1186 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -85,6 +85,8 @@ class BasePushEngine : public ClockedObject public: + PARAMS(BasePushEngine); + BasePushEngine(const BasePushEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/base/base_wl_engine.hh 
b/src/accl/graph/base/base_wl_engine.hh index a2cab4c7e2..4cb492914c 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -104,6 +104,9 @@ class BaseWLEngine : public ClockedObject virtual bool sendWLNotif(Addr addr) = 0; public: + + PARAMS(BaseWLEngine); + BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index dc19ece06b..793dacc2ef 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,10 @@ Import('*') -SimObject('ApplyEngine.py') -SimObject('MPU.py') -SimObject('PushEngine.py') -SimObject('WLEngine.py') +SimObject('ApplyEngine.py', sim_objects=["ApplyEngine"]) +SimObject('MPU.py', sim_objects=["MPU"]) +SimObject('PushEngine.py', sim_objects=["PushEngine"]) +SimObject('WLEngine.py', sim_objects=["WLEngine"]) Source('apply_engine.cc') Source('mpu.cc') diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 17e3280cb5..c7d3073e36 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -56,7 +56,7 @@ class ApplyEngine : public BaseApplyEngine virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: - + PARAMS(ApplyEngine); ApplyEngine(const ApplyEngineParams ¶ms); }; diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index be5139c0e0..cf241c9063 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -119,7 +119,7 @@ class MPU : public ClockedObject void recvFunctional(PacketPtr pkt); public: - + PARAMS(MPU); MPU(const MPUParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e4bb83d2bc..1a800e58f3 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -47,6 +47,7 @@ class PushEngine : public 
BasePushEngine virtual bool sendPushUpdate(PacketPtr pkt); public: + PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c5f49ff6a2..238ffbe724 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -59,7 +59,7 @@ class WLEngine : public BaseWLEngine virtual bool sendWLNotif(Addr addr); public: - + PARAMS(WLEngine); WLEngine(const WLEngineParams ¶ms); }; From 9a5245c317917f60daf0eb400260ec5b11304f26 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 15:33:13 -0800 Subject: [PATCH 047/247] First compilation after restructure. --- src/accl/graph/base/BaseApplyEngine.py | 1 + src/accl/graph/base/BasePushEngine.py | 1 + src/accl/graph/base/BaseWLEngine.py | 1 + src/accl/graph/base/SConscript | 6 +++--- src/accl/graph/sega/SConscript | 8 ++++---- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index e48b425b01..fdabefc732 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BaseApplyEngine(ClockedObject): + abstract = True type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 793b0a7c92..d30124a6a4 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BasePushEngine(ClockedObject): + abstract = True type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 
473fd05313..7dcacefd97 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BaseWLEngine(ClockedObject): + abstract = True type = 'BaseWLEngine' cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 5e82a44971..cc55100064 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,9 @@ Import('*') -SimObject('BaseApplyEngine.py', sim_objects=["BaseApplyEngine"]) -SimObject('BasePushEngine.py', sim_objects=["BasePushEngine"]) -SimObject('BaseWLEngine.py', sim_objects=["BaseWLEngine"]) +SimObject('BaseApplyEngine.py') +SimObject('BasePushEngine.py') +SimObject('BaseWLEngine.py') Source('base_apply_engine.cc') Source('base_push_engine.cc') diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 793dacc2ef..dc19ece06b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,10 @@ Import('*') -SimObject('ApplyEngine.py', sim_objects=["ApplyEngine"]) -SimObject('MPU.py', sim_objects=["MPU"]) -SimObject('PushEngine.py', sim_objects=["PushEngine"]) -SimObject('WLEngine.py', sim_objects=["WLEngine"]) +SimObject('ApplyEngine.py') +SimObject('MPU.py') +SimObject('PushEngine.py') +SimObject('WLEngine.py') Source('apply_engine.cc') Source('mpu.cc') From c3b4c743d4953d3648fca7dd384e0f8ed33006f2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 07:38:41 -0800 Subject: [PATCH 048/247] Adding config file for SEGA and missing ports. 
--- configs/accl/sega.py | 34 ++++++++++++++++++++++++++++++++++ src/accl/graph/sega/MPU.py | 10 +++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 configs/accl/sega.py diff --git a/configs/accl/sega.py b/configs/accl/sega.py new file mode 100644 index 0000000000..288b1211e4 --- /dev/null +++ b/configs/accl/sega.py @@ -0,0 +1,34 @@ +import m5 +from m5.objects import * + +class PyMPU(MPU): + def __init__(self, clk_domain): + super().__init__() + self.clk_domain = clk_domain + self.apply_engine = ApplyEngine() + self.push_engine = PushEngine() + self.wl_engine = WLEngine() + +class SEGA(System): + + def __init__(self): + super(SEGA, self).__init__() + # Set up the clock domain and the voltage domain + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + + self.mpu = PyMPU(self.clk_domain) + self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) + self.mpu.memPort = self.mem_ctrl.port + self.mpu.reqPort = self.mpu.respPort + + +system = SEGA() +root = Root(full_system = False, system = system) + +m5.instantiate() + +exit_event = m5.simulate() +print("Simulation finished!") +exit() \ No newline at end of file diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 68cfb3d42d..efd8dbc11f 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,9 +38,13 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - work_list_engine = Param.WLEngine("WLEngine object to connect to " + apply_engine = Param.ApplyEngine(NULL, "ApplyEngine object to connect to " "This MPU") - apply_engine = Param.ApplyEngine("ApplyEngine object to connect to " + push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " "This MPU") - push_engine = Param.PushEngine("PushEngine object to connect to " + work_list_engine = Param.WLEngine(NULL, "WLEngine object to connect to " "This MPU") + + respPort = 
ResponsePort("Port to Receive updates from outside") + reqPort = RequestPort("Port to send updates to the outside") + memPort = RequestPort("Port to communicate with the memory") From 7be5866c0171399e8d5ef6851290dd61e7ef6fc9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 12:22:14 -0800 Subject: [PATCH 049/247] Adding BaseEngine class and started pointer fix. --- src/accl/graph/base/BaseApplyEngine.py | 4 +- src/accl/graph/base/BaseEngine.py | 38 ++++++++++ src/accl/graph/base/BasePushEngine.py | 2 + src/accl/graph/base/BaseWLEngine.py | 1 + src/accl/graph/base/base_apply_engine.cc | 22 +----- src/accl/graph/base/base_apply_engine.hh | 9 +-- src/accl/graph/base/base_engine.cc | 75 ++++++++++++++++++++ src/accl/graph/base/base_engine.hh | 90 ++++++++++++++++++++++++ src/accl/graph/sega/ApplyEngine.py | 2 +- 9 files changed, 213 insertions(+), 30 deletions(-) create mode 100644 src/accl/graph/base/BaseEngine.py create mode 100644 src/accl/graph/base/base_engine.cc create mode 100644 src/accl/graph/base/base_engine.hh diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index fdabefc732..be849ed1af 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BaseApplyEngine(ClockedObject): +class BaseApplyEngine(BaseEngine): abstract = True type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py new file mode 100644 index 0000000000..3eb5f0cbbc --- /dev/null +++ b/src/accl/graph/base/BaseEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseEngine(ClockedObject): + abstract = True + type = 'BaseEngine' + cxx_header = "accl/graph/base/base_engine.hh" + cxx_class = 'gem5::BaseEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index d30124a6a4..c52a65abf9 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -34,3 +34,5 @@ class BasePushEngine(ClockedObject): type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 7dcacefd97..ec34b52005 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -36,3 +36,4 @@ class BaseWLEngine(ClockedObject): cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 731cd5c345..4fd53fb037 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -36,31 +36,12 @@ namespace gem5 { BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): - ClockedObject(params), - requestorId(-1), + BaseEngine(params), queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} -Port & -BaseApplyEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BaseApplyEngine::getRequestorId() -{ - return requestorId; -} - -void -BaseApplyEngine::setRequestorId(RequestorID 
requestorId) -{ - this->requestorId = requestorId; -} - bool BaseApplyEngine::recvWLNotif(Addr addr){ // TODO: Investigate the situation where the queue is full. // if (applyReadQueue.size() == queueSize){ @@ -82,6 +63,7 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; + // FIXME: sendMemReq returns void, use memPortBlocked to check instead. if (sendMemReq(memPkt)){ applyReadQueue.pop(); } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index fbcf95c238..f81f23428e 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -32,6 +32,7 @@ #include #include +#include "accl/graph/base/base_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "mem/request.hh" @@ -42,12 +43,10 @@ namespace gem5 { -class BaseApplyEngine : public ClockedObject +class BaseApplyEngine : public BaseEngine { private: - RequestorID requestorId; - std::queue applyReadQueue; std::queue applyWriteQueue; int queueSize; @@ -61,7 +60,6 @@ class BaseApplyEngine : public ClockedObject void processNextApplyEvent(); protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: @@ -72,9 +70,6 @@ class BaseApplyEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool recvWLNotif(Addr addr); bool handleMemResp(PacketPtr resp); }; diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc new file mode 100644 index 0000000000..d53e2e683a --- /dev/null +++ b/src/accl/graph/base/base_engine.cc @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/base_engine.hh" + +namespace gem5 +{ + +BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : + ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId()), + memPort(name() + ".memPort", this) +{} + + +void +BaseEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +BaseEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + return owner->handleMemResp(pkt); + +} + +void +BaseEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh new file mode 100644 index 0000000000..f9f500e118 --- /dev/null +++ b/src/accl/graph/base/base_engine.hh @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ + +#include +#include + +#include "mem/packet.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/BaseEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + System* system; + const RequestorID requestorId; + MemPort memPort; + + protected: + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + virtual bool handleMemResp(PacketPtr resp) = 0; + + public: + PARAMS(BaseEngine); + + BaseEngine(const BaseEngineParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index bb43836ff7..5bb0dc0c25 100644 --- 
a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -34,4 +34,4 @@ class ApplyEngine(BaseApplyEngine): cxx_header = "accl/graph/sega/apply_engine.hh" cxx_class = 'gem5::ApplyEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this ApplyEngine") + push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") From 1bf60b6fa044f8913814d4234e4a209f6076fa1d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 21:44:29 -0800 Subject: [PATCH 050/247] Cont. fixing pointer issue. --- src/accl/graph/base/BaseApplyEngine.py | 2 - src/accl/graph/base/BaseWLEngine.py | 7 +-- src/accl/graph/base/base_apply_engine.cc | 53 ++++++++-------- src/accl/graph/base/base_apply_engine.hh | 4 +- src/accl/graph/base/base_wl_engine.cc | 79 +++++++----------------- src/accl/graph/base/base_wl_engine.hh | 52 +++------------- 6 files changed, 63 insertions(+), 134 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index be849ed1af..9b240581ac 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -34,5 +34,3 @@ class BaseApplyEngine(BaseEngine): type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' - - applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index ec34b52005..7311c396b3 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -27,13 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BaseWLEngine(ClockedObject): +class BaseWLEngine(BaseEngine): abstract = True type = 'BaseWLEngine' cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' - - wlQueueSize = Param.Unsigned(32, "Size of write 
queue") - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 4fd53fb037..7f6c32cf39 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,34 +37,35 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} -bool BaseApplyEngine::recvWLNotif(Addr addr){ +bool +BaseApplyEngine::recvWLNotif(Addr addr) +{ // TODO: Investigate the situation where the queue is full. - // if (applyReadQueue.size() == queueSize){ - // // applyReadQueue.sendPktRetry = true; - // return true; - // } else{ applyReadQueue.push(addr); - // } if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; } -void BaseApplyEngine::processNextApplyCheckEvent(){ +void +BaseApplyEngine::processNextApplyCheckEvent() +{ + // TODO: We might want to change the way this function + // pops items off queue, maybe we should pop every n cycles + // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; int req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - // FIXME: sendMemReq returns void, use memPortBlocked to check instead. 
- if (sendMemReq(memPkt)){ + if (!memPortBlocked()) { + sendMemReq(memPkt); applyReadQueue.pop(); } if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -75,7 +76,6 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ bool BaseApplyEngine::handleMemResp(PacketPtr pkt) { - // FIXME: change the event, remove the retry parts applyWriteQueue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); @@ -84,38 +84,39 @@ BaseApplyEngine::handleMemResp(PacketPtr pkt) } void -BaseApplyEngine::processNextApplyEvent(){ +BaseApplyEngine::processNextApplyEvent() +{ PacketPtr pkt = applyWriteQueue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t prop = wl.prop; - uint32_t temp_prop = wl.temp_prop; - if (temp_prop != prop) { + WorkListItem wl = memoryToWorkList(data + request_offset); + // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem + // to applyengine if temp_prop < prop. If temp_prop has not changed, why + // fwd it to applyengine? + if (wl.temp_prop < wl.prop) { // TODO: instead of min add a Reduce function. //update prop with temp_prop - if(prop < temp_prop) { - wl.prop = prop; - }else { - wl.prop = temp_prop; - } + wl.prop = wl.temp_prop; //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); //Create memory write requests. 
PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { - applyWriteQueue.pop(); + + if (!memPortBlocked()) { + if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { + sendMemReq(writePkt); + applyWriteQueue.pop(); + } } - }else { + } else { applyWriteQueue.pop(); } - if(!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ + if (!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index f81f23428e..dc7188ab56 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -37,7 +37,6 @@ #include "mem/port.hh" #include "mem/request.hh" #include "params/BaseApplyEngine.hh" -#include "sim/clocked_object.hh" #include "sim/port.hh" namespace gem5 @@ -60,6 +59,7 @@ class BaseApplyEngine : public BaseEngine void processNextApplyEvent(); protected: + virtual bool handleMemResp(PacketPtr pkt); virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: @@ -71,7 +71,7 @@ class BaseApplyEngine : public BaseEngine PortID idx=InvalidPortID) override; bool recvWLNotif(Addr addr); - bool handleMemResp(PacketPtr resp); + }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 806ab4a6c3..aab39fb7a3 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -34,61 +34,37 @@ namespace gem5 { BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): - ClockedObject(params), - requestorId(-1), - updateQueue(params.wlQueueSize), - responseQueue(params.wlQueueSize), + BaseEngine(params), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) {} -Port & -BaseWLEngine::getPort(const std::string &if_name, PortID 
idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BaseWLEngine::getRequestorId() +bool +BaseWLEngine::handleWLUpdate(PacketPtr pkt) { - return requestorId; + updateQueue.push(pkt); + if(!nextWLReadEvent.scheduled()) { + schedule(nextWLReadEvent, nextCycle()); + } + return true; } -void -BaseWLEngine::setRequestorId(RequestorID requestorId) +void BaseWLEngine::processNextWLReadEvent() { - this->requestorId = requestorId; -} + PacketPtr pkt = updateQueue.front(); -bool -BaseWLEngine::handleWLUpdate(PacketPtr pkt){ - auto queue = updateQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else - queue.push(pkt); + Addr addr = pkt->getAddr(); + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; - if(!nextWLReadEvent.scheduled()){ - schedule(nextWLReadEvent, nextCycle()); - } - return true; -} + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffsetMap[request] = req_offset; -void BaseWLEngine::processNextWLReadEvent(){ - auto queue = updateQueue; - PacketPtr pkt = queue.front(); - /// conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = - std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - if (sendMemReq(memPkt)){ - queue.pop(); + if (memPortBlocked()) { + sendMemReq(memPkt) + updateQueue.pop(); } - if(!queue.empty() && !nextWLReadEvent.scheduled()){ + if (!queue.empty() && !nextWLReadEvent.scheduled()) { schedule(nextWLReadEvent, nextCycle()); } } @@ -96,24 +72,15 @@ void BaseWLEngine::processNextWLReadEvent(){ bool BaseWLEngine::handleMemResp(PacketPtr pkt) { - auto queue = responseQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else{ - queue.push(pkt); - } - if(!nextWLReduceEvent.scheduled()){ - schedule(nextWLReduceEvent, nextCycle()); - } - return true; + responseQueue.push(pkt); + 
if(!nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } return true; } void BaseWLEngine::processNextWLReduceEvent(){ - auto queue = responseQueue; - auto updateQ = updateQueue; PacketPtr update = updateQ.front(); uint8_t* value = update->getPtr(); PacketPtr pkt = queue.front(); diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 4cb492914c..063e9909be 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -32,57 +32,26 @@ #include #include +#include "accl/graph/base/base_engine.hh" #include "accl/graph/base/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/BaseWLEngine.hh" -#include "sim/clocked_object.hh" #include "sim/port.hh" #include "sim/system.hh" namespace gem5 { -class BaseWLEngine : public ClockedObject +class BaseWLEngine : public BaseEngine { private: - //FIXME: Change this - struct WLQueue{ - std::queue wlQueue; - uint32_t queueSize; - bool sendPktRetry; - - void resize(uint32_t size){ - queueSize = size; - } - - bool blocked(){ - return (wlQueue.size() == queueSize); - } - bool empty(){ - return wlQueue.empty(); - } - void push(PacketPtr pkt){ - wlQueue.push(pkt); - } - void pop(){ - wlQueue.pop(); - } - PacketPtr front(){ - return wlQueue.front(); - } - - WLQueue(uint32_t qSize): - queueSize(qSize), - sendPktRetry(false){} - }; - - RequestorID requestorId; - WLQueue updateQueue; - WLQueue responseQueue; - - std::unordered_map requestOffset; + std::queue updateQueue; + std::queue responseQueue; + + std::unordered_map requestOffsetMap; + std::unordered_map requestValueMap; //Events EventFunctionWrapper nextWLReadEvent; @@ -100,7 +69,7 @@ class BaseWLEngine : public ClockedObject Write edgelist loc in buffer */ protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool handleMemResp(PacketPtr resp); virtual bool sendWLNotif(Addr addr) = 0; public: @@ -112,11 +81,8 @@ class 
BaseWLEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool handleWLUpdate(PacketPtr pkt); - bool handleMemResp(PacketPtr resp); + }; } From a8a3d0dc91778cbb21553938f7b3840e2d2af979 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 10:16:01 -0800 Subject: [PATCH 051/247] Cont. fix pointer issue. --- src/accl/graph/base/BasePushEngine.py | 6 +-- src/accl/graph/base/base_apply_engine.hh | 1 - src/accl/graph/base/base_push_engine.cc | 19 ------- src/accl/graph/base/base_push_engine.hh | 19 ++----- src/accl/graph/base/base_wl_engine.cc | 64 +++++++++--------------- 5 files changed, 31 insertions(+), 78 deletions(-) diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index c52a65abf9..2163864be3 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -27,12 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BasePushEngine(ClockedObject): +class BasePushEngine(BaseEngine): abstract = True type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index dc7188ab56..2cb9d8b918 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -48,7 +48,6 @@ class BaseApplyEngine : public BaseEngine std::queue applyReadQueue; std::queue applyWriteQueue; - int queueSize; std::unordered_map requestOffset; diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index d93cbdf8da..f2384c434b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ 
b/src/accl/graph/base/base_push_engine.cc @@ -35,7 +35,6 @@ namespace gem5 BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : ClockedObject(params), - requestorId(-1), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), // updateQueue(params.update_queue_size), @@ -46,24 +45,6 @@ BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : { } -Port & -BasePushEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BasePushEngine::getRequestorId() -{ - return requestorId; -} - -void -BasePushEngine::setRequestorId(RequestorID requestorId) -{ - this->requestorId = requestorId; -} - bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 446f6a1186..f568b6ecc3 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -31,16 +31,16 @@ #include +#include "accl/graph/base/base_engine.hh" #include "mem/port.hh" #include "mem/request.hh" #include "mem/packet.hh" #include "params/BasePushEngine.hh" -#include "sim/clocked_object.hh" namespace gem5 { -class BasePushEngine : public ClockedObject +class BasePushEngine : public BaseEngine { private: @@ -53,9 +53,6 @@ class BasePushEngine : public ClockedObject prop(prop), degree(degree), edgeIndex(edge_index) {} }; - - RequestorID requestorId; - std::queue notifQueue; // int vertexQueueSize; // int vertexQueueLen; @@ -64,8 +61,6 @@ class BasePushEngine : public ClockedObject std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - std::queue memReqQueue; // Infinite queueing? 
- std::queue updateQueue; // int updateQueueSize; // int updateQueueLen; @@ -80,8 +75,8 @@ class BasePushEngine : public ClockedObject void processNextSendEvent(); protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendPushUpdate(PacketPtr pkt) = 0; + virtual bool handleMemResp(PacketPtr pkt); public: @@ -89,14 +84,8 @@ class BasePushEngine : public ClockedObject BasePushEngine(const BasePushEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - bool handleMemResp(PacketPtr pkt); + }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index aab39fb7a3..d5b18bafa0 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -52,13 +52,15 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt) void BaseWLEngine::processNextWLReadEvent() { PacketPtr pkt = updateQueue.front(); + uint32_t data = *(pkt->getPtr()); Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; Addr req_offset = addr % 64; PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[request] = req_offset; + requestOffsetMap[memPkt->req] = req_offset; + requestValueMap[memPkt->req] = value; if (memPortBlocked()) { sendMemReq(memPkt) @@ -80,51 +82,35 @@ BaseWLEngine::handleMemResp(PacketPtr pkt) } void -BaseWLEngine::processNextWLReduceEvent(){ - PacketPtr update = updateQ.front(); - uint8_t* value = update->getPtr(); - PacketPtr pkt = queue.front(); - uint8_t* data = pkt->getPtr(); - RequestPtr request = pkt->req; - int request_offset = requestOffset[request]; +BaseWLEngine::processNextWLReduceEvent() +{ + PacketPtr resp = responseQueue.front(); + uint8_t* respData = resp->getPtr(); + Addr request_offset = requestOffsetMap[resp->req]; + uint32_t value = requestValueMap[resp->req]; 
WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t temp_prop = wl.temp_prop; - if (temp_prop != *value){ + + if (value < wl.temp_prop){ //update prop with temp_prop - if(*value < temp_prop){ - temp_prop = *value; - } - // if (!memPort.blocked() && !applyPort.blocked()){ - wl.temp_prop = temp_prop; - uint8_t* wlItem = workListToMemory(wl); - memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + wl.temp_prop = value; + + uint8_t* wlData = workListToMemory(wl); + memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - sendWLNotif(writePkt->getAddr())) { - queue.pop(); - if (!queue.blocked() && queue.sendPktRetry){ - queue.sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); - updateQ.sendPktRetry = false; + getWritePacket(pkt->getAddr(), 64, respData, requestorId); + + if (!memPortBlocked()) { + if (sendWLNotif(pkt->getAddr() + request_offset)) { + sendMemReq(writePkt); + responseQueue.pop(); + // TODO: Erase map entries, delete wlData; } } } - else{ - queue.pop(); - if (!queue.blocked() && queue.sendPktRetry){ - queue.sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ.blocked() & updateQ.sendPktRetry){ - updateQ.sendPktRetry = false; - } - + else { + responseQueue.pop(); } - if (!queue.empty() && !nextWLReduceEvent.scheduled()){ + if (!responseQueue.empty() && !nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } } From 5a595540a569128ec01d730c25f4091a0a7c3a6f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:07:21 -0800 Subject: [PATCH 052/247] Cont. fix pointer issue. MemQ to BaseEngine. 
--- src/accl/graph/base/base_apply_engine.cc | 22 ++----- src/accl/graph/base/base_apply_engine.hh | 11 +--- src/accl/graph/base/base_engine.cc | 13 +++- src/accl/graph/base/base_engine.hh | 17 +++++- src/accl/graph/base/base_push_engine.cc | 77 ++++++------------------ src/accl/graph/base/base_push_engine.hh | 16 +---- src/accl/graph/base/base_wl_engine.cc | 22 ++----- src/accl/graph/base/base_wl_engine.hh | 3 +- src/accl/graph/sega/mpu.hh | 2 - 9 files changed, 65 insertions(+), 118 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 7f6c32cf39..842481c2d1 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -73,20 +73,10 @@ BaseApplyEngine::processNextApplyCheckEvent() } } -bool -BaseApplyEngine::handleMemResp(PacketPtr pkt) -{ - applyWriteQueue.push(pkt); - if(!nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); - } - return true; -} - void -BaseApplyEngine::processNextApplyEvent() +BaseApplyEngine::processNextMemRespEvent() { - PacketPtr pkt = applyWriteQueue.front(); + PacketPtr pkt = memRespQueue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; @@ -110,14 +100,14 @@ BaseApplyEngine::processNextApplyEvent() if (!memPortBlocked()) { if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); - applyWriteQueue.pop(); + memRespQueue.pop(); } } } else { - applyWriteQueue.pop(); + memRespQueue.pop(); } - if (!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ + schedule(nextMemRespEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 2cb9d8b918..02646a74ff 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -45,21 +45,17 @@ namespace gem5 class BaseApplyEngine : 
public BaseEngine { private: - std::queue applyReadQueue; - std::queue applyWriteQueue; std::unordered_map requestOffset; EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - protected: - virtual bool handleMemResp(PacketPtr pkt); - virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; + virtual bool sendApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edgeIndex) = 0; + virtual void processNextMemRespEvent(); public: PARAMS(BaseApplyEngine); @@ -70,7 +66,6 @@ class BaseApplyEngine : public BaseEngine PortID idx=InvalidPortID) override; bool recvWLNotif(Addr addr); - }; } diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index d53e2e683a..6a50e1630e 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -35,7 +35,8 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), requestorId(system->getRequestorId()), - memPort(name() + ".memPort", this) + memPort(name() + ".memPort", this), + nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) {} @@ -72,4 +73,14 @@ BaseEngine::MemPort::recvReqRetry() } } +bool +BaseEngine::handleMemResp(PacketPtr pkt) +{ + memRespQueue.push(pkt); + if (!nextMemResponseEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemResponseEvent, nextCycle()); + } + return true; +} + } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index f9f500e118..4f5a29676d 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -66,14 +66,28 @@ class BaseEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; + System* system; const RequestorID requestorId; MemPort memPort; + bool handleMemResp(PacketPtr resp); + EventFunctionWrapper nextMemRespEvent; + protected: bool 
memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } - virtual bool handleMemResp(PacketPtr resp) = 0; + + // TODO: Add this later, maybe? + // int memRespQueueSize; + std::queue memRespQueue; + /* Respective function for nextMemRespEvent. + All the classes inheriting from this class will + do their main processing in this function. For + example, BaseWLEngine reduces the temp_pro with + the value of update in this function. + */ + virtual void processNextMemRespEvent() = 0; public: PARAMS(BaseEngine); @@ -82,7 +96,6 @@ class BaseEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - }; } diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index f2384c434b..4c43f95939 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -40,7 +40,6 @@ BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : // updateQueue(params.update_queue_size), // updateQueueLen(0), nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), - nextReadEvent([this] { processNextReadEvent(); }, name()), nextSendEvent([this] { processNextSendEvent(); }, name()) { } @@ -49,16 +48,6 @@ bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { - //FIXME: There should be a check if the queues are full. 
- // if (vertexQueueLen < vertexQueueSize) { - // vertexQueue.push(pkt) - // vertexQueueLen++; - // if (!nextReceiveEvent.scheduled()) { - // schedule(nextReceiveEvent, nextCycle()); - // } - // return true; - // } - // return false; notifQueue.emplace(prop, degree, edge_index); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); @@ -67,7 +56,7 @@ BasePushEngine::recvApplyNotif(uint32_t prop, } void -BasePushEngine::processNextReceiveEvent() +BasePushEngine::processNextReadEvent() { ApplyNotif notif = notifQueue.front(); @@ -95,39 +84,28 @@ BasePushEngine::processNextReceiveEvent() offset_queue.push_back(req_offset); num_edge_queue.push_back(1); } - } + }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); - memReqQueue.push(pkt); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = notif.prop; + if (!memPortBlocked()) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = notif.prop; + sendMemReq(pkt); + notifQueue.pop(); + } } - notifQueue.pop(); - - if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + if (!nextReadEvent.scheduled() && !notifQueue.empty()) { schedule(nextReadEvent, nextCycle()); } } void -BasePushEngine::processNextReadEvent() -{ - PacketPtr pkt = memReqQueue.front(); - if (!sendMemReq(pkt)) { - memReqQueue.pop(); - } - - if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { - schedule(nextReadEvent, nextCycle()); - } -} - -bool -BasePushEngine::handleMemResp(PacketPtr pkt) +BasePushEngine::processNextMemRespEvent() { + PacketPtr pkt = memRespQueue.front(); RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -137,7 +115,7 @@ BasePushEngine::handleMemResp(PacketPtr pkt) int edge_in_bytes = sizeof(Edge) / 
sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + i * edge_in_bytes; + uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); Edge e = memoryToEdge(curr_edge_data); uint32_t *update_data = new uint32_t; @@ -146,29 +124,14 @@ BasePushEngine::handleMemResp(PacketPtr pkt) PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, requestorId); - updateQueue.push(update); - } - - if (!nextSendEvent.scheduled() && !updateQueue.empty()) { - schedule(nextSendEvent, nextCycle()); - } - - //TODO: Should we always return true? It's the response from the memory - // so maybe yes. We assume the receiving bandwidth of the PushEngine is - // higher than its demand bandwidth - return true; -} - -void -BasePushEngine::processNextSendEvent() -{ - PacketPtr pkt = updateQueue.front(); - if (!sendPushUpdate(pkt)) { - updateQueue.pop(); + if (sendPushUpdate(update)) { + memRespQueue.pop(); + // TODO: Erase map entries here. 
+ } } - if (!nextSendEvent.scheduled() && !updateQueue.empty()) { - schedule(nextSendEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemRespEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index f568b6ecc3..5a6ef85b0f 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -43,7 +43,6 @@ namespace gem5 class BasePushEngine : public BaseEngine { private: - struct ApplyNotif { uint32_t prop; uint32_t degree; @@ -53,30 +52,20 @@ class BasePushEngine : public BaseEngine prop(prop), degree(degree), edgeIndex(edge_index) {} }; + std::queue notifQueue; // int vertexQueueSize; - // int vertexQueueLen; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - std::queue updateQueue; - // int updateQueueSize; - // int updateQueueLen; - - EventFunctionWrapper nextReceiveEvent; - void processNextReceiveEvent(); - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - EventFunctionWrapper nextSendEvent; - void processNextSendEvent(); - protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual bool handleMemResp(PacketPtr pkt); + virtual void processNextMemRespEvent(); public: @@ -85,7 +74,6 @@ class BasePushEngine : public BaseEngine BasePushEngine(const BasePushEngineParams ¶ms); bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index d5b18bafa0..5d84e34ccd 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -63,7 +63,7 @@ void BaseWLEngine::processNextWLReadEvent() requestValueMap[memPkt->req] = value; if (memPortBlocked()) { - sendMemReq(memPkt) + sendMemReq(memPkt); updateQueue.pop(); } if (!queue.empty() && !nextWLReadEvent.scheduled()) { @@ -71,20 +71,10 @@ void 
BaseWLEngine::processNextWLReadEvent() } } -bool -BaseWLEngine::handleMemResp(PacketPtr pkt) -{ - responseQueue.push(pkt); - if(!nextWLReduceEvent.scheduled()){ - schedule(nextWLReduceEvent, nextCycle()); - } - return true; -} - void -BaseWLEngine::processNextWLReduceEvent() +BaseWLEngine::processNextMemRespEvent() { - PacketPtr resp = responseQueue.front(); + PacketPtr resp = memRespQueue.front(); uint8_t* respData = resp->getPtr(); Addr request_offset = requestOffsetMap[resp->req]; uint32_t value = requestValueMap[resp->req]; @@ -102,15 +92,15 @@ BaseWLEngine::processNextWLReduceEvent() if (!memPortBlocked()) { if (sendWLNotif(pkt->getAddr() + request_offset)) { sendMemReq(writePkt); - responseQueue.pop(); + memRespQueue.pop(); // TODO: Erase map entries, delete wlData; } } } else { - responseQueue.pop(); + memRespQueue.pop(); } - if (!responseQueue.empty() && !nextWLReduceEvent.scheduled()){ + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 063e9909be..ab8952de41 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -69,8 +69,8 @@ class BaseWLEngine : public BaseEngine Write edgelist loc in buffer */ protected: - virtual bool handleMemResp(PacketPtr resp); virtual bool sendWLNotif(Addr addr) = 0; + virtual void processNextMemRespEvent(); public: @@ -82,7 +82,6 @@ class BaseWLEngine : public BaseEngine PortID idx=InvalidPortID) override; bool handleWLUpdate(PacketPtr pkt); - }; } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index cf241c9063..8b5ba20b1c 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -129,8 +129,6 @@ class MPU : public ClockedObject void handleMemResp(PacketPtr pkt); bool handleWLUpdate(PacketPtr pkt); - bool recvWLNotif(Addr addr); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t 
edge_index); bool recvPushUpdate(PacketPtr pkt); }; From 4d2ad56c51ecfd4070a0800d9ec51cf5fc5aa225 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:21:51 -0800 Subject: [PATCH 053/247] Pointer issue fixed. --- src/accl/graph/sega/MPU.py | 4 --- src/accl/graph/sega/WLEngine.py | 3 +- src/accl/graph/sega/apply_engine.cc | 14 +++----- src/accl/graph/sega/apply_engine.hh | 7 ++-- src/accl/graph/sega/mpu.cc | 55 ++++------------------------- src/accl/graph/sega/mpu.hh | 10 +----- src/accl/graph/sega/push_engine.cc | 15 +------- src/accl/graph/sega/push_engine.hh | 5 --- src/accl/graph/sega/wl_engine.cc | 14 +++----- src/accl/graph/sega/wl_engine.hh | 7 ++-- 10 files changed, 23 insertions(+), 111 deletions(-) diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index efd8dbc11f..71b8841b10 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,12 +38,8 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - apply_engine = Param.ApplyEngine(NULL, "ApplyEngine object to connect to " - "This MPU") push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " "This MPU") - work_list_engine = Param.WLEngine(NULL, "WLEngine object to connect to " - "This MPU") respPort = ResponsePort("Port to Receive updates from outside") reqPort = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 12fbcf9b4f..3bfe9fa16f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,4 +34,5 @@ class WLEngine(BaseWLEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this WLEngine") \ No newline at end of file + apply_engine = Param.ApplyEngine(Parent.any, + "MPU object that owns this WLEngine") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 
bc45850041..0f686e7f8c 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,24 +27,20 @@ */ #include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/push_engine.hh" namespace gem5{ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : BaseApplyEngine(params), - mpu(params.mpu) + pushEngine(params.push_engine) {} bool -ApplyEngine::sendMemReq(PacketPtr pkt){ - return mpu->handleMemReq(pkt); -} +ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +{ + return push_engine->recvApplyNotif(prop, degree, edgeIndex); -bool -ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ - mpu->recvApplyNotif(prop, degree, edgeIndex); - return true; } } \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index c7d3073e36..4d828c6aa1 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -42,17 +42,14 @@ namespace gem5 { -class MPU; +class PushEngine; class ApplyEngine : public BaseApplyEngine { private: - - MPU* mpu; + PushEngine* pushEngine; protected: - - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 4824bcd699..23a777d1c6 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -33,12 +33,9 @@ namespace gem5 MPU::MPU(const MPUParams ¶ms): ClockedObject(params), - nextRequestorId(0), respPort(name() + ".respPort", this), reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), - applyEngine(params.apply_engine), - pushEngine(params.push_engine), wlEngine(params.work_list_engine) {} @@ -59,16 +56,6 @@ MPU::getPort(const std::string &if_name, PortID idx) void MPU::startup() { - if (((int16_t) applyEngine->getRequestorId()) == -1) { - 
applyEngine->setRequestorId(nextRequestorId++); - } - if (((int16_t) pushEngine->getRequestorId()) == -1) { - pushEngine->setRequestorId(nextRequestorId++); - } - if (((int16_t) wlEngine->getRequestorId()) == -1) { - wlEngine->setRequestorId(nextRequestorId++); - } - //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { @@ -177,9 +164,7 @@ MPU::MPUMemPort::sendPacket(PacketPtr pkt) bool MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) { - //TODO: Investigate sending true all the time - owner->handleMemResp(pkt); - return true; + panic("recvTimingResp called on MPU::MPUMemPort memPort."); } void @@ -224,16 +209,7 @@ MPU::handleMemReq(PacketPtr pkt) void MPU::handleMemResp(PacketPtr pkt) { - RequestorID requestorId = pkt->requestorId(); - if (applyEngine->getRequestorId() == requestorId) { - applyEngine->handleMemResp(pkt); - } else if (pushEngine->getRequestorId() == requestorId) { - pushEngine->handleMemResp(pkt); - } else if (wlEngine->getRequestorId() == requestorId) { - wlEngine->handleMemResp(pkt); - } else { - panic("Received a response with an unknown requestorId."); - } + panic("MPU::handleMemResp called!"); } bool @@ -242,39 +218,20 @@ MPU::handleWLUpdate(PacketPtr pkt) return wlEngine->handleWLUpdate(pkt); } -bool -MPU::recvWLNotif(Addr addr) -{ - return applyEngine->recvWLNotif(addr); -} - -bool -MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) -{ - return pushEngine->recvApplyNotif(prop, degree, edge_index); -} - bool MPU::recvPushUpdate(PacketPtr pkt) { Addr addr = pkt->getAddr(); for (auto addr_range: memPort.getAddrRanges()) { if (addr_range.contains(addr)) { - if (memPort.blocked()) { - return false; - } else { - memPort.sendPacket(pkt); - return true; - } + return handleWLUpdate(pkt); } } - - if (reqPort.blocked()) { - return false; + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; } - reqPort.sendPacket(pkt); return true; - } } diff 
--git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8b5ba20b1c..2df8993749 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -103,18 +103,13 @@ class MPU : public ClockedObject virtual void recvReqRetry(); }; - virtual void startup(); - - RequestorID nextRequestorId; - MPURespPort respPort; MPUReqPort reqPort; MPUMemPort memPort; - ApplyEngine* applyEngine; - PushEngine* pushEngine; WLEngine* wlEngine; + virtual void startup(); AddrRangeList getAddrRanges(); void recvFunctional(PacketPtr pkt); @@ -125,9 +120,6 @@ class MPU : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool handleMemReq(PacketPtr pkt); - void handleMemResp(PacketPtr pkt); - bool handleWLUpdate(PacketPtr pkt); bool recvPushUpdate(PacketPtr pkt); }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 922ae32ed2..71cb2955fd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -35,20 +35,7 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), mpu(params.mpu) -{ -} - -Port & -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -bool -PushEngine::sendMemReq(PacketPtr pkt) -{ - return mpu->handleMemReq(pkt); -} +{} bool PushEngine::sendPushUpdate(PacketPtr pkt) diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1a800e58f3..7b3474d2ec 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -43,16 +43,11 @@ class PushEngine : public BasePushEngine MPU* mpu; protected: - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendPushUpdate(PacketPtr pkt); public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - }; } diff --git a/src/accl/graph/sega/wl_engine.cc 
b/src/accl/graph/sega/wl_engine.cc index 40ec755969..3d9d7af0c6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,25 +27,19 @@ */ #include "accl/graph/sega/wl_engine.hh" -#include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/apply_engine.hh" + namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), - mpu(params.mpu) + applyEngine(params.apply_engine) {} -bool -WLEngine::sendMemReq(PacketPtr pkt){ - return mpu->handleMemReq(pkt); -} - -// FIXME: handle the case where Apply queue is full bool WLEngine::sendWLNotif(Addr addr){ - mpu->recvWLNotif(addr); - return true; + apply_engine->recvWLNotif(addr); } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 238ffbe724..c154867b0d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,17 +45,14 @@ namespace gem5 { -// class MPU; +class ApplyEngine; class WLEngine : public BaseWLEngine { private: - - MPU* mpu; + ApplyEngine* applyEngine; protected: - - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendWLNotif(Addr addr); public: From 39883a68c9f8c2895ce9c0a5315dd9cf4eec7a9c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:24:07 -0800 Subject: [PATCH 054/247] Adding BaseEngine to SConscript. 
--- src/accl/graph/base/SConscript | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index cc55100064..41c48fc419 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -28,10 +28,12 @@ Import('*') SimObject('BaseApplyEngine.py') +SimObject('BaseEngine.py') SimObject('BasePushEngine.py') SimObject('BaseWLEngine.py') Source('base_apply_engine.cc') +Source('base_engine.cc') Source('base_push_engine.cc') Source('base_wl_engine.cc') Source('util.cc') From adfa21a1a8b9ee69b7e75dab14e8db2f1be7e2ca Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:43:55 -0800 Subject: [PATCH 055/247] Compilation issues fixed. Still linking issues. --- src/accl/graph/base/BaseEngine.py | 1 + src/accl/graph/base/base_apply_engine.cc | 3 +-- src/accl/graph/base/base_engine.cc | 6 +++--- src/accl/graph/base/base_engine.hh | 14 +++++++------- src/accl/graph/base/base_push_engine.cc | 16 +++++----------- src/accl/graph/base/base_wl_engine.cc | 10 +++++----- src/accl/graph/sega/MPU.py | 8 ++------ src/accl/graph/sega/apply_engine.cc | 3 +-- src/accl/graph/sega/apply_engine.hh | 2 +- src/accl/graph/sega/mpu.cc | 14 -------------- src/accl/graph/sega/mpu.hh | 2 -- src/accl/graph/sega/push_engine.cc | 1 - src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 3 +-- src/accl/graph/sega/wl_engine.hh | 1 + 15 files changed, 29 insertions(+), 56 deletions(-) diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py index 3eb5f0cbbc..367df8dbc1 100644 --- a/src/accl/graph/base/BaseEngine.py +++ b/src/accl/graph/base/BaseEngine.py @@ -35,4 +35,5 @@ class BaseEngine(ClockedObject): cxx_header = "accl/graph/base/base_engine.hh" cxx_class = 'gem5::BaseEngine' + system = Param.System(Parent.any, 'System this Engine is a part of') memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc 
b/src/accl/graph/base/base_apply_engine.cc index 842481c2d1..b7f3030e00 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) {} bool diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 6a50e1630e..06827c1d4e 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -34,8 +34,8 @@ namespace gem5 BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), - requestorId(system->getRequestorId()), memPort(name() + ".memPort", this), + requestorId(system->getRequestorId(this)), nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) {} @@ -77,8 +77,8 @@ bool BaseEngine::handleMemResp(PacketPtr pkt) { memRespQueue.push(pkt); - if (!nextMemResponseEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemResponseEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemRespEvent, nextCycle()); } return true; } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 4f5a29676d..057a4c6d91 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -68,25 +68,25 @@ class BaseEngine : public ClockedObject }; System* system; - const RequestorID requestorId; MemPort memPort; bool handleMemResp(PacketPtr resp); - EventFunctionWrapper nextMemRespEvent; protected: - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } - + const RequestorID requestorId; // TODO: Add this later, maybe? 
// int memRespQueueSize; std::queue memRespQueue; - /* Respective function for nextMemRespEvent. - All the classes inheriting from this class will + + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + + /* All the classes inheriting from this class will do their main processing in this function. For example, BaseWLEngine reduces the temp_pro with the value of update in this function. */ + EventFunctionWrapper nextMemRespEvent; virtual void processNextMemRespEvent() = 0; public: diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 4c43f95939..187eefe01b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -34,23 +34,17 @@ namespace gem5 { BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : - ClockedObject(params), - // vertexQueueSize(params.vertex_queue_size), - // vertexQueueLen(0), - // updateQueue(params.update_queue_size), - // updateQueueLen(0), - nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), - nextSendEvent([this] { processNextSendEvent(); }, name()) -{ -} + BaseEngine(params), + nextReadEvent([this] { processNextReadEvent(); }, name()) +{} bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { notifQueue.emplace(prop, degree, edge_index); - if (!nextReceiveEvent.scheduled()) { - schedule(nextReceiveEvent, nextCycle()); + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); } return true; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 5d84e34ccd..20abaa7b20 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -52,7 +52,7 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt) void BaseWLEngine::processNextWLReadEvent() { PacketPtr pkt = updateQueue.front(); - uint32_t data = *(pkt->getPtr()); + uint32_t value = 
*(pkt->getPtr()); Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; @@ -66,7 +66,7 @@ void BaseWLEngine::processNextWLReadEvent() sendMemReq(memPkt); updateQueue.pop(); } - if (!queue.empty() && !nextWLReadEvent.scheduled()) { + if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); } } @@ -78,7 +78,7 @@ BaseWLEngine::processNextMemRespEvent() uint8_t* respData = resp->getPtr(); Addr request_offset = requestOffsetMap[resp->req]; uint32_t value = requestValueMap[resp->req]; - WorkListItem wl = memoryToWorkList(data + request_offset); + WorkListItem wl = memoryToWorkList(respData + request_offset); if (value < wl.temp_prop){ //update prop with temp_prop @@ -87,10 +87,10 @@ BaseWLEngine::processNextMemRespEvent() uint8_t* wlData = workListToMemory(wl); memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, respData, requestorId); + getWritePacket(resp->getAddr(), 64, respData, requestorId); if (!memPortBlocked()) { - if (sendWLNotif(pkt->getAddr() + request_offset)) { + if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); // TODO: Erase map entries, delete wlData; diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 71b8841b10..87de0fb7d6 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -28,18 +28,14 @@ from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject - -# from m5.objects.WLEngine import WLEngine -# from m5.objects.PushEngine import PushEngine -# from m5.objects.ApplyEngine import ApplyEngine +from m5.objects.WLEngine import WLEngine class MPU(ClockedObject): type = 'MPU' cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " - "This MPU") + work_list_engine = Param.WLEngine(NULL, "WLEngine to connect to this MPU") respPort = 
ResponsePort("Port to Receive updates from outside") reqPort = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 0f686e7f8c..bc3d703cf6 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/push_engine.hh" namespace gem5{ @@ -39,7 +38,7 @@ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : bool ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) { - return push_engine->recvApplyNotif(prop, degree, edgeIndex); + return pushEngine->recvApplyNotif(prop, degree, edgeIndex); } diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 4d828c6aa1..aff2c5417b 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_apply_engine.hh" +#include "accl/graph/sega/push_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/ApplyEngine.hh" @@ -42,7 +43,6 @@ namespace gem5 { -class PushEngine; class ApplyEngine : public BaseApplyEngine { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 23a777d1c6..9bda696cb5 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -198,20 +198,6 @@ MPU::recvFunctional(PacketPtr pkt) } } -bool -MPU::handleMemReq(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - memPort.sendPacket(pkt); - return true; -} - -void -MPU::handleMemResp(PacketPtr pkt) -{ - panic("MPU::handleMemResp called!"); -} - bool MPU::handleWLUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 2df8993749..a0472eead5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,8 +29,6 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ 
-#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/port.hh" diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 71cb2955fd..a1fa86da2b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/push_engine.hh" -#include "accl/graph/sega/mpu.hh" namespace gem5 { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7b3474d2ec..edf698011d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "params/PushEngine.hh" namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 3d9d7af0c6..823aa49bb9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" namespace gem5 { @@ -39,7 +38,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): bool WLEngine::sendWLNotif(Addr addr){ - apply_engine->recvWLNotif(addr); + return applyEngine->recvWLNotif(addr); } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c154867b0d..6946713aaa 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_wl_engine.hh" +#include "accl/graph/sega/apply_engine.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" From 05771a071f7016fe66fc0da8e551ef793ac0c059 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 13:19:29 -0800 Subject: [PATCH 056/247] Removing unnecessary includes. 
--- src/accl/graph/base/base_apply_engine.cc | 4 ++-- src/accl/graph/base/base_apply_engine.hh | 5 +---- src/accl/graph/base/base_engine.hh | 4 +--- src/accl/graph/base/base_push_engine.hh | 4 +--- src/accl/graph/base/base_wl_engine.hh | 8 -------- 5 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index b7f3030e00..009c01ccb7 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -59,7 +59,7 @@ BaseApplyEngine::processNextApplyCheckEvent() // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; - int req_offset = (addr % 64); + Addr req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; @@ -79,7 +79,7 @@ BaseApplyEngine::processNextMemRespEvent() uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; - int request_offset = requestOffset[request]; + Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); // FIXME: Not so much of a fixme. 
However, why do we fwd a worklistitem diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 02646a74ff..e3fe47d923 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -33,11 +33,8 @@ #include #include "accl/graph/base/base_engine.hh" -#include "mem/packet.hh" -#include "mem/port.hh" #include "mem/request.hh" #include "params/BaseApplyEngine.hh" -#include "sim/port.hh" namespace gem5 { @@ -47,7 +44,7 @@ class BaseApplyEngine : public BaseEngine private: std::queue applyReadQueue; - std::unordered_map requestOffset; + std::unordered_map requestOffset; EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 057a4c6d91..b0b05d9477 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -34,10 +34,8 @@ #include "mem/packet.hh" #include "mem/port.hh" -#include "mem/request.hh" #include "params/BaseEngine.hh" #include "sim/clocked_object.hh" -#include "sim/port.hh" #include "sim/system.hh" namespace gem5 @@ -79,7 +77,7 @@ class BaseEngine : public ClockedObject std::queue memRespQueue; bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } /* All the classes inheriting from this class will do their main processing in this function. 
For diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 5a6ef85b0f..0da4241dfd 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -32,9 +32,7 @@ #include #include "accl/graph/base/base_engine.hh" -#include "mem/port.hh" #include "mem/request.hh" -#include "mem/packet.hh" #include "params/BasePushEngine.hh" namespace gem5 @@ -54,7 +52,7 @@ class BasePushEngine : public BaseEngine }; std::queue notifQueue; - // int vertexQueueSize; + // int notifQueueSize; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index ab8952de41..3ca9a146a1 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -34,12 +34,7 @@ #include "accl/graph/base/base_engine.hh" #include "accl/graph/base/util.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" #include "params/BaseWLEngine.hh" -#include "sim/port.hh" -#include "sim/system.hh" namespace gem5 { @@ -78,9 +73,6 @@ class BaseWLEngine : public BaseEngine BaseWLEngine(const BaseWLEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - bool handleWLUpdate(PacketPtr pkt); }; From 01b4b2a5a80247c969243bbb52bbbe9bd4ef41f8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 13:51:30 -0800 Subject: [PATCH 057/247] Fixing the issue of calling pure virtual function. 
--- src/accl/graph/base/base_apply_engine.cc | 17 +++++++++++++---- src/accl/graph/base/base_apply_engine.hh | 6 +++++- src/accl/graph/base/base_engine.cc | 7 ++----- src/accl/graph/base/base_engine.hh | 8 +------- src/accl/graph/base/base_push_engine.cc | 17 +++++++++++++---- src/accl/graph/base/base_push_engine.hh | 5 ++++- src/accl/graph/base/base_wl_engine.cc | 13 +++++++++++-- src/accl/graph/base/base_wl_engine.hh | 2 +- 8 files changed, 50 insertions(+), 25 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 009c01ccb7..e7b7dd6a22 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,7 +37,8 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} bool @@ -73,7 +74,7 @@ BaseApplyEngine::processNextApplyCheckEvent() } void -BaseApplyEngine::processNextMemRespEvent() +BaseApplyEngine::processNextApplyEvent() { PacketPtr pkt = memRespQueue.front(); uint8_t* data = pkt->getPtr(); @@ -105,8 +106,16 @@ BaseApplyEngine::processNextMemRespEvent() } else { memRespQueue.pop(); } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextMemRespEvent, nextCycle()); + if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ + schedule(nextApplyEvent, nextCycle()); + } +} + +void +BaseApplyEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index e3fe47d923..486fb687fe 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -49,10 +49,14 @@ class 
BaseApplyEngine : public BaseEngine EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - virtual void processNextMemRespEvent(); + + virtual void scheduleMainEvent(); public: PARAMS(BaseApplyEngine); diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 06827c1d4e..245192643c 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -35,8 +35,7 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), memPort(name() + ".memPort", this), - requestorId(system->getRequestorId(this)), - nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) + requestorId(system->getRequestorId(this)) {} @@ -77,9 +76,7 @@ bool BaseEngine::handleMemResp(PacketPtr pkt) { memRespQueue.push(pkt); - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemRespEvent, nextCycle()); - } + scheduleMainEvent(); return true; } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index b0b05d9477..3436229aa1 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -79,13 +79,7 @@ class BaseEngine : public ClockedObject bool memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } - /* All the classes inheriting from this class will - do their main processing in this function. For - example, BaseWLEngine reduces the temp_pro with - the value of update in this function. 
- */ - EventFunctionWrapper nextMemRespEvent; - virtual void processNextMemRespEvent() = 0; + virtual void scheduleMainEvent() = 0; public: PARAMS(BaseEngine); diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 187eefe01b..a963cc9709 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -35,7 +35,8 @@ namespace gem5 BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : BaseEngine(params), - nextReadEvent([this] { processNextReadEvent(); }, name()) + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextPushEvent([this] { processNextPushEvent(); }, name()) {} bool @@ -97,7 +98,7 @@ BasePushEngine::processNextReadEvent() } void -BasePushEngine::processNextMemRespEvent() +BasePushEngine::processNextPushEvent() { PacketPtr pkt = memRespQueue.front(); RequestPtr req = pkt->req; @@ -124,8 +125,16 @@ BasePushEngine::processNextMemRespEvent() } } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemRespEvent, nextCycle()); + if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextPushEvent, nextCycle()); + } +} + +void +BasePushEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextPushEvent.scheduled()) { + schedule(nextPushEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 0da4241dfd..8bb7d6663a 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -61,9 +61,12 @@ class BasePushEngine : public BaseEngine EventFunctionWrapper nextReadEvent; void processNextReadEvent(); + EventFunctionWrapper nextPushEvent; + void processNextPushEvent(); + protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void processNextMemRespEvent(); + virtual void scheduleMainEvent(); public: diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 
20abaa7b20..ef66603de7 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -72,7 +72,7 @@ void BaseWLEngine::processNextWLReadEvent() } void -BaseWLEngine::processNextMemRespEvent() +BaseWLEngine::processNextWLReduceEvent() { PacketPtr resp = memRespQueue.front(); uint8_t* respData = resp->getPtr(); @@ -100,9 +100,18 @@ BaseWLEngine::processNextMemRespEvent() else { memRespQueue.pop(); } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ + if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } } +void +BaseWLEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextWLReduceEvent.scheduled()) { + schedule(nextWLReduceEvent, nextCycle()); + } +} + + } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3ca9a146a1..a5070f0b26 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,7 +65,7 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; - virtual void processNextMemRespEvent(); + virtual void scheduleMainEvent(); public: From 235746cdf270f617df2c556e3a676d7f4d02b355 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 17:33:02 -0800 Subject: [PATCH 058/247] Fixed cycle in hierarchy and config. Sim starts. 
--- configs/accl/sega.py | 40 ++-- src/accl/graph/base/BaseEngine.py | 2 +- src/accl/graph/base/base_apply_engine.hh | 5 +- src/accl/graph/base/base_engine.cc | 12 ++ src/accl/graph/base/base_engine.hh | 5 +- src/accl/graph/base/base_push_engine.hh | 2 +- src/accl/graph/base/base_wl_engine.hh | 2 +- src/accl/graph/sega/MPU.py | 42 ----- src/accl/graph/sega/PushEngine.py | 2 +- src/accl/graph/sega/SConscript | 2 - src/accl/graph/sega/WLEngine.py | 1 + src/accl/graph/sega/apply_engine.hh | 3 +- src/accl/graph/sega/mpu.cc | 223 ----------------------- src/accl/graph/sega/mpu.hh | 127 ------------- src/accl/graph/sega/push_engine.cc | 49 ++++- src/accl/graph/sega/push_engine.hh | 27 ++- src/accl/graph/sega/wl_engine.cc | 88 +++++++++ src/accl/graph/sega/wl_engine.hh | 34 +++- 18 files changed, 238 insertions(+), 428 deletions(-) delete mode 100644 src/accl/graph/sega/MPU.py delete mode 100644 src/accl/graph/sega/mpu.cc delete mode 100644 src/accl/graph/sega/mpu.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 288b1211e4..ea158ecdc9 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,28 +1,46 @@ import m5 from m5.objects import * -class PyMPU(MPU): - def __init__(self, clk_domain): - super().__init__() - self.clk_domain = clk_domain - self.apply_engine = ApplyEngine() +class MPU(SubSystem): + def __init__(self): + super(MPU, self).__init__() self.push_engine = PushEngine() - self.wl_engine = WLEngine() + self.apply_engine = ApplyEngine(push_engine = self.push_engine) + self.wl_engine = WLEngine(apply_engine = self.apply_engine) + self.interconnect = SystemXBar() -class SEGA(System): + self.interconnect.cpu_side_ports = self.wl_engine.mem_port + self.interconnect.cpu_side_ports = self.apply_engine.mem_port + self.interconnect.cpu_side_ports = self.push_engine.mem_port + + def getRespPort(self): + return self.wl_engine.resp_port + def setRespPort(self, port): + self.wl_engine.resp_port = port + + def getReqPort(self): + return 
self.push_engine.req_port + def setReqPort(self, port): + self.push_engine.req_port = port + def getMemPort(self): + return self.interconnect.mem_side_ports + def setMemPort(self, port): + self.interconnect.mem_side_ports = port + +class SEGA(System): def __init__(self): super(SEGA, self).__init__() - # Set up the clock domain and the voltage domain + self.clk_domain = SrcClockDomain() self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.mpu = PyMPU(self.clk_domain) + self.mpu = MPU() self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) - self.mpu.memPort = self.mem_ctrl.port - self.mpu.reqPort = self.mpu.respPort + self.mpu.setReqPort(self.mpu.getRespPort()) + self.mpu.setMemPort(self.mem_ctrl.port) system = SEGA() root = Root(full_system = False, system = system) diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py index 367df8dbc1..16c2f402e5 100644 --- a/src/accl/graph/base/BaseEngine.py +++ b/src/accl/graph/base/BaseEngine.py @@ -36,4 +36,4 @@ class BaseEngine(ClockedObject): cxx_class = 'gem5::BaseEngine' system = Param.System(Parent.any, 'System this Engine is a part of') - memPort = RequestPort("Port to communicate with the memory") + mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 486fb687fe..9111bd074b 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -56,16 +56,13 @@ class BaseApplyEngine : public BaseEngine virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: PARAMS(BaseApplyEngine); BaseApplyEngine(const BaseApplyEngineParams &apply); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - bool recvWLNotif(Addr addr); }; diff --git 
a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 245192643c..6b40ba4137 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -38,6 +38,18 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : requestorId(system->getRequestorId(this)) {} +BaseEngine::~BaseEngine() +{} + +Port& +BaseEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} void BaseEngine::MemPort::sendPacket(PacketPtr pkt) diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 3436229aa1..53415ddc7c 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -32,6 +32,7 @@ #include #include +#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/BaseEngine.hh" @@ -78,6 +79,8 @@ class BaseEngine : public ClockedObject bool memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } virtual void scheduleMainEvent() = 0; @@ -85,7 +88,7 @@ class BaseEngine : public ClockedObject PARAMS(BaseEngine); BaseEngine(const BaseEngineParams ¶ms); - + ~BaseEngine(); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 8bb7d6663a..01027d2791 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -66,7 +66,7 @@ class BasePushEngine : public BaseEngine protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/base/base_wl_engine.hh 
b/src/accl/graph/base/base_wl_engine.hh index a5070f0b26..38079f8f94 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,7 +65,7 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py deleted file mode 100644 index 87de0fb7d6..0000000000 --- a/src/accl/graph/sega/MPU.py +++ /dev/null @@ -1,42 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -from m5.objects.WLEngine import WLEngine - -class MPU(ClockedObject): - type = 'MPU' - cxx_header = "accl/graph/sega/mpu.hh" - cxx_class = 'gem5::MPU' - - work_list_engine = Param.WLEngine(NULL, "WLEngine to connect to this MPU") - - respPort = ResponsePort("Port to Receive updates from outside") - reqPort = RequestPort("Port to send updates to the outside") - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index eb0eed18ab..a743b57262 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,4 +34,4 @@ class PushEngine(BasePushEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this PushEngine") + req_port = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index dc19ece06b..f20d0e44df 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -28,11 +28,9 @@ Import('*') SimObject('ApplyEngine.py') -SimObject('MPU.py') SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') -Source('mpu.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py 
b/src/accl/graph/sega/WLEngine.py index 3bfe9fa16f..2d650ecb92 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,5 +34,6 @@ class WLEngine(BaseWLEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' + resp_port = ResponsePort("Port to Receive updates from outside") apply_engine = Param.ApplyEngine(Parent.any, "MPU object that owns this WLEngine") diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index aff2c5417b..1190786e36 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -50,7 +50,8 @@ class ApplyEngine : public BaseApplyEngine PushEngine* pushEngine; protected: - virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + virtual bool sendApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edgeIndex) override; public: PARAMS(ApplyEngine); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc deleted file mode 100644 index 9bda696cb5..0000000000 --- a/src/accl/graph/sega/mpu.cc +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/sega/mpu.hh" - -namespace gem5 -{ - -MPU::MPU(const MPUParams ¶ms): - ClockedObject(params), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), - memPort(name() + ".memPort", this), - wlEngine(params.work_list_engine) -{} - -Port& -MPU::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "respPort") { - return respPort; - } else if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -MPU::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. 
- WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 - }; - Edge edges [6] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - memPort.sendFunctional(pkt); - } - - for (int i = 0; i < 6; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - memPort.sendFunctional(pkt); - } -} - -AddrRangeList -MPU::MPURespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -MPU::MPURespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleWLUpdate(pkt); -} - -Tick -MPU::MPURespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -MPU::MPURespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -MPU::MPURespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -MPU::MPUReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -MPU::MPUReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -MPU::MPUReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -void -MPU::MPUMemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on MPU::MPUMemPort memPort."); -} - -void -MPU::MPUMemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -AddrRangeList -MPU::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - -void -MPU::recvFunctional(PacketPtr pkt) -{ - if (pkt->cmd == MemCmd::UpdateWL) { - panic("Functional requests should not be made to WL."); - //TODO: Might be a good idea to implement later. 
- // wlEngine->recvFunctional(pkt); - } else { - memPort.sendFunctional(pkt); - } -} - -bool -MPU::handleWLUpdate(PacketPtr pkt) -{ - return wlEngine->handleWLUpdate(pkt); -} - -bool -MPU::recvPushUpdate(PacketPtr pkt) -{ - Addr addr = pkt->getAddr(); - for (auto addr_range: memPort.getAddrRanges()) { - if (addr_range.contains(addr)) { - return handleWLUpdate(pkt); - } - } - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return true; -} - -} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh deleted file mode 100644 index a0472eead5..0000000000 --- a/src/accl/graph/sega/mpu.hh +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ -#define __ACCL_GRAPH_SEGA_MPU_HH__ - -#include "accl/graph/sega/wl_engine.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" -#include "params/MPU.hh" -#include "sim/clocked_object.hh" - -namespace gem5 -{ - -class MPU : public ClockedObject -{ - private: - class MPURespPort : public ResponsePort - { - private: - MPU* owner; - - public: - MPURespPort(const std::string& name, MPU* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class MPUReqPort : public RequestPort - { - private: - MPU* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MPUReqPort(const std::string& name, MPU* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - class MPUMemPort : public RequestPort - { - private: - MPU* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MPUMemPort(const std::string& name, MPU* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - 
void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - MPURespPort respPort; - MPUReqPort reqPort; - MPUMemPort memPort; - - WLEngine* wlEngine; - - virtual void startup(); - AddrRangeList getAddrRanges(); - void recvFunctional(PacketPtr pkt); - - public: - PARAMS(MPU); - MPU(const MPUParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - bool handleWLUpdate(PacketPtr pkt); - bool recvPushUpdate(PacketPtr pkt); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a1fa86da2b..c7b229ad33 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -33,13 +33,58 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), - mpu(params.mpu) + reqPort(name() + "reqPort", this) {} +Port& +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return BasePushEngine::getPort(if_name, idx); + } +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + bool PushEngine::sendPushUpdate(PacketPtr pkt) { - return mpu->recvPushUpdate(pkt); + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index edf698011d..604df4750d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,7 +30,6 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" -#include "accl/graph/sega/mpu.hh" #include "params/PushEngine.hh" namespace gem5 @@ -41,14 +40,36 @@ class MPU; class PushEngine : public BasePushEngine { private: - MPU* mpu; + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, PushEngine* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; protected: - virtual bool sendPushUpdate(PacketPtr pkt); + virtual bool sendPushUpdate(PacketPtr pkt) override; public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 823aa49bb9..e565ac119b 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ 
b/src/accl/graph/sega/wl_engine.cc @@ -33,12 +33,100 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), + respPort(name() + ".respPort", this), applyEngine(params.apply_engine) {} +Port& +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "resp_port") { + return respPort; + } else { + return BaseWLEngine::getPort(if_name, idx); + } +} + +void +WLEngine::startup() +{ + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + sendMemFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + sendMemFunctional(pkt); + } +} + bool WLEngine::sendWLNotif(Addr addr){ return applyEngine->recvWLNotif(addr); } +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleWLUpdate(pkt); +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->cmd == MemCmd::UpdateWL) { + panic("Functional requests should not be made to WL."); + //TODO: Might be a good 
idea to implement later. + // wlEngine->recvFunctional(pkt); + } else { + sendMemFunctional(pkt); + } +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 6946713aaa..f895a7ad32 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,14 +34,7 @@ #include "accl/graph/base/base_wl_engine.hh" #include "accl/graph/sega/apply_engine.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" #include "params/WLEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/port.hh" -#include "sim/system.hh" - namespace gem5 { @@ -51,14 +44,39 @@ class ApplyEngine; class WLEngine : public BaseWLEngine { private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + + public: + RespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + RespPort respPort; ApplyEngine* applyEngine; + + virtual void startup(); + void recvFunctional(PacketPtr pkt); + protected: - virtual bool sendWLNotif(Addr addr); + virtual bool sendWLNotif(Addr addr) override; public: PARAMS(WLEngine); WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } From d66efdf5a3e2e2fc4d425ad2f80ab22da10a19a5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 18:23:54 -0800 Subject: [PATCH 059/247] Started fixing memory leak. 
--- src/accl/graph/base/base_apply_engine.cc | 6 +++--- src/accl/graph/base/base_push_engine.cc | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index e7b7dd6a22..7b643969df 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,9 +61,9 @@ BaseApplyEngine::processNextApplyCheckEvent() Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; Addr req_offset = (addr % 64); - RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; + + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffset[memPkt->req] = req_offset; if (!memPortBlocked()) { sendMemReq(memPkt); applyReadQueue.pop(); diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index a963cc9709..6e5aa05779 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -112,7 +112,8 @@ BasePushEngine::processNextPushEvent() for (int i = 0; i < num_edges; i++) { uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); Edge e = memoryToEdge(curr_edge_data); - uint32_t *update_data = new uint32_t; + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; From df1340a91e5262a0d97faed7ffd39bf1e62af840 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 18:35:05 -0800 Subject: [PATCH 060/247] Adding newlines. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/apply_engine.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ea158ecdc9..54970d356e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -49,4 +49,4 @@ def __init__(self): exit_event = m5.simulate() print("Simulation finished!") -exit() \ No newline at end of file +exit() diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index bc3d703cf6..5d5f8daf26 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -42,4 +42,4 @@ ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) } -} \ No newline at end of file +} From ef0f9669a303035981a9ffc298b4acdf275d1ffc Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 24 Feb 2022 11:43:26 -0800 Subject: [PATCH 061/247] Removed the UpdateWL from the MemCmd. --- src/accl/graph/base/util.cc | 3 ++- src/accl/graph/sega/wl_engine.cc | 13 +++++++------ src/mem/packet.hh | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc index 0baa374714..4172607ed0 100644 --- a/src/accl/graph/base/util.cc +++ b/src/accl/graph/base/util.cc @@ -133,7 +133,8 @@ getUpdatePacket(Addr addr, unsigned int size, // bits req->setPC(((Addr)requestorId) << 2); - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->allocate(); pkt->setData(data); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e565ac119b..f3c63e71f3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -120,13 +120,14 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - if (pkt->cmd == MemCmd::UpdateWL) { - panic("Functional requests should not be made to WL."); - //TODO: Might be a good idea to 
implement later. - // wlEngine->recvFunctional(pkt); - } else { + // FIXME: This needs to be fixed + // if (pkt->cmd == MemCmd::UpdateWL) { + // panic("Functional requests should not be made to WL."); + // //TODO: Might be a good idea to implement later. + // // wlEngine->recvFunctional(pkt); + // } else { sendMemFunctional(pkt); - } + // } } } diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 5332ee32a2..a67abbbbaa 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,7 +149,7 @@ class MemCmd // Tlb shootdown TlbiExtSync, // MPU Accelerator - UpdateWL, + // UpdateWL, NUM_MEM_CMDS }; From acfffa3e25a866c6dc3aaa844ac195e530a44096 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 25 Feb 2022 11:49:51 -0800 Subject: [PATCH 062/247] Adding initial update. Fixing some bugs. --- src/accl/graph/base/base_wl_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index ef66603de7..1b9d92c1b4 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -62,7 +62,7 @@ void BaseWLEngine::processNextWLReadEvent() requestOffsetMap[memPkt->req] = req_offset; requestValueMap[memPkt->req] = value; - if (memPortBlocked()) { + if (!memPortBlocked()) { sendMemReq(memPkt); updateQueue.pop(); } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f3c63e71f3..61bee38c05 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -80,6 +80,15 @@ WLEngine::startup() 16, data, 0); sendMemFunctional(pkt); } + + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = getUpdatePacket( + 0, 4, first_update_data, requestorId); + + handleWLUpdate(first_update); } bool From 75825c3de944037f32c8b21d73106bcac77cbb00 Mon Sep 17 00:00:00 2001 From: 
Marjan Fariborz Date: Fri, 25 Feb 2022 13:35:24 -0800 Subject: [PATCH 063/247] Adding few debugging flags. --- src/accl/graph/base/SConscript | 2 ++ src/accl/graph/base/base_apply_engine.cc | 7 +++++++ src/accl/graph/base/base_push_engine.cc | 5 ++++- src/accl/graph/base/base_wl_engine.cc | 6 ++++++ src/accl/graph/sega/wl_engine.cc | 2 +- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 41c48fc419..c5c8c4e901 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -37,3 +37,5 @@ Source('base_engine.cc') Source('base_push_engine.cc') Source('base_wl_engine.cc') Source('util.cc') + +DebugFlag('MPU') diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 7b643969df..5eb9d90059 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -31,6 +31,8 @@ #include #include "accl/graph/base/util.hh" +#include "debug/MPU.hh" + namespace gem5 { @@ -83,6 +85,8 @@ BaseApplyEngine::processNextApplyEvent() Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); + DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item: %s\n" + , __func__, wl.to_string()); // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem // to applyengine if temp_prop < prop. If temp_prop has not changed, why // fwd it to applyengine? 
@@ -101,6 +105,9 @@ BaseApplyEngine::processNextApplyEvent() if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); memRespQueue.pop(); + DPRINTF(MPU, "%s: The Apply Engine is applying the new value", + "into WorkList Item: %s\n" + , __func__, wl.to_string()); } } } else { diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 6e5aa05779..f46941b8ed 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/base/base_push_engine.hh" #include "accl/graph/base/util.hh" +#include "debug/MPU.hh" namespace gem5 { @@ -47,6 +48,7 @@ BasePushEngine::recvApplyNotif(uint32_t prop, if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } + DPRINTF(MPU, "%s: Reading %d edges.", __func__, degree); return true; } @@ -114,7 +116,6 @@ BasePushEngine::processNextPushEvent() Edge e = memoryToEdge(curr_edge_data); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, @@ -122,6 +123,8 @@ BasePushEngine::processNextPushEvent() requestorId); if (sendPushUpdate(update)) { memRespQueue.pop(); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); // TODO: Erase map entries here. 
} } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 1b9d92c1b4..38ebf0f35b 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -27,6 +27,7 @@ */ #include "accl/graph/base/base_wl_engine.hh" +#include "debug/MPU.hh" #include @@ -80,6 +81,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); + DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s\n" + , __func__, wl.to_string()); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -89,10 +92,13 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); + if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); + DPRINTF(MPU, "%s: The WLE is chanching to: %s\n" + , __func__, wl.to_string()); // TODO: Erase map entries, delete wlData; } } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 61bee38c05..674004d7a5 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,7 @@ */ #include "accl/graph/sega/wl_engine.hh" - +#include "debug/MPU.hh" namespace gem5 { From d3f342cab70cc838b254365789afe4947d6677bc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 15:04:53 -0800 Subject: [PATCH 064/247] Adding lock_dir. 
--- configs/accl/sega.py | 6 +- src/accl/graph/base/base_apply_engine.cc | 19 ++++--- src/accl/graph/base/base_apply_engine.hh | 3 +- src/accl/graph/base/base_wl_engine.cc | 23 +++++--- src/accl/graph/base/base_wl_engine.hh | 2 + src/accl/graph/sega/ApplyEngine.py | 1 + src/accl/graph/sega/LockDir.py | 46 +++++++++++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/WLEngine.py | 1 + src/accl/graph/sega/apply_engine.cc | 15 ++++- src/accl/graph/sega/apply_engine.hh | 4 ++ src/accl/graph/sega/lock_dir.cc | 71 ++++++++++++++++++++++++ src/accl/graph/sega/lock_dir.hh | 57 +++++++++++++++++++ src/accl/graph/sega/wl_engine.cc | 15 ++++- src/accl/graph/sega/wl_engine.hh | 5 +- 15 files changed, 248 insertions(+), 22 deletions(-) create mode 100644 src/accl/graph/sega/LockDir.py create mode 100644 src/accl/graph/sega/lock_dir.cc create mode 100644 src/accl/graph/sega/lock_dir.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 54970d356e..db0bf4678f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,11 +4,13 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() + self.lock_dir = LockDirectory() self.push_engine = PushEngine() - self.apply_engine = ApplyEngine(push_engine = self.push_engine) - self.wl_engine = WLEngine(apply_engine = self.apply_engine) + self.apply_engine = ApplyEngine(push_engine = self.push_engine, lock_dir = self.lock_dir) + self.wl_engine = WLEngine(apply_engine = self.apply_engine, lock_dir = self.lock_dir) self.interconnect = SystemXBar() + self.interconnect.cpu_side_ports = self.wl_engine.mem_port self.interconnect.cpu_side_ports = self.apply_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 5eb9d90059..890d5dd313 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,14 +61,16 @@ 
BaseApplyEngine::processNextApplyCheckEvent() // pops items off queue, maybe we should pop every n cycles // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); + if (acquireAddress(addr)) { + Addr req_addr = (addr / 64) * 64; + Addr req_offset = (addr % 64); - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffset[memPkt->req] = req_offset; - if (!memPortBlocked()) { - sendMemReq(memPkt); - applyReadQueue.pop(); + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffset[memPkt->req] = req_offset; + if (!memPortBlocked()) { + sendMemReq(memPkt); + applyReadQueue.pop(); + } } if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -113,6 +115,9 @@ BaseApplyEngine::processNextApplyEvent() } else { memRespQueue.pop(); } + if (!releaseAddress(pkt->getAddr())) { + panic("Could not release an address"); + } if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ schedule(nextApplyEvent, nextCycle()); } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 9111bd074b..f4df298079 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -55,7 +55,8 @@ class BaseApplyEngine : public BaseEngine protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - + virtual bool acquireAddress(Addr addr) = 0; + virtual bool releaseAddress(Addr addr) = 0; virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 38ebf0f35b..7f1a27aae5 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -56,16 +56,18 @@ void BaseWLEngine::processNextWLReadEvent() uint32_t value = *(pkt->getPtr()); Addr addr = pkt->getAddr(); - Addr req_addr = 
(addr / 64) * 64; - Addr req_offset = addr % 64; + if (acquireAddress(addr)) { + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[memPkt->req] = req_offset; - requestValueMap[memPkt->req] = value; + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffsetMap[memPkt->req] = req_offset; + requestValueMap[memPkt->req] = value; - if (!memPortBlocked()) { - sendMemReq(memPkt); - updateQueue.pop(); + if (!memPortBlocked()) { + sendMemReq(memPkt); + updateQueue.pop(); + } } if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); @@ -92,7 +94,6 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); - if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); @@ -106,6 +107,10 @@ BaseWLEngine::processNextWLReduceEvent() else { memRespQueue.pop(); } + if (!releaseAddress(resp->getAddr())) { + panic("Could not release an address"); + } + std::cout << "success" << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 38079f8f94..15371f965b 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,6 +65,8 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; + virtual bool acquireAddress(Addr addr) = 0; + virtual bool releaseAddress(Addr addr) = 0; virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index 5bb0dc0c25..7a446bb620 100644 --- a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -35,3 +35,4 @@ class ApplyEngine(BaseApplyEngine): cxx_class = 
'gem5::ApplyEngine' push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") + lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/LockDir.py b/src/accl/graph/sega/LockDir.py new file mode 100644 index 0000000000..d21963dc3a --- /dev/null +++ b/src/accl/graph/sega/LockDir.py @@ -0,0 +1,46 @@ +# Copyright (c) 2012-2014, 2017-2018 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2007 The Regents of The University of Michigan +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class LockDirectory(SimObject): + type = 'LockDirectory' + cxx_header = 'accl/graph/sega/lock_dir.hh' + cxx_class = 'gem5::LockDirectory' diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index f20d0e44df..e6d2f1fbbc 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -28,9 +28,11 @@ Import('*') SimObject('ApplyEngine.py') +SimObject('LockDir.py') SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') +Source('lock_dir.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 2d650ecb92..b6e697266e 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -37,3 +37,4 @@ class WLEngine(BaseWLEngine): resp_port = ResponsePort("Port to Receive updates from outside") apply_engine = Param.ApplyEngine(Parent.any, "MPU object that owns this WLEngine") + lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc 
index 5d5f8daf26..544bb082ad 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -32,7 +32,8 @@ namespace gem5{ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : BaseApplyEngine(params), - pushEngine(params.push_engine) + pushEngine(params.push_engine), + lockDir(params.lock_dir) {} bool @@ -42,4 +43,16 @@ ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) } +bool +ApplyEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +ApplyEngine::releaseAddress(Addr addr) +{ + return lockDir->release(addr, requestorId); +} + } diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 1190786e36..c88330487a 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" #include "accl/graph/sega/push_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -48,10 +49,13 @@ class ApplyEngine : public BaseApplyEngine { private: PushEngine* pushEngine; + LockDirectory* lockDir; protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; public: PARAMS(ApplyEngine); diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/lock_dir.cc new file mode 100644 index 0000000000..b7efa638fe --- /dev/null +++ b/src/accl/graph/sega/lock_dir.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/lock_dir.hh" + +namespace gem5 +{ + +LockDirectory::LockDirectory(const LockDirectoryParams ¶ms) : + SimObject(params) +{} + +bool +LockDirectory::acquire(Addr addr, RequestorID requestorId) +{ + if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { + lockOwnerMap[addr] = requestorId; + lockDegreeMap[addr] = 1; + return true; + } else if (lockOwnerMap[addr] == requestorId) { + lockDegreeMap[addr] = lockDegreeMap[addr] + 1; + return true; + } else { + return false; + } +} + +bool +LockDirectory::release(Addr addr, RequestorID requestorId) +{ + if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { + panic("Should not relase an address before acquiring"); + } else if (lockOwnerMap[addr] != requestorId) { + panic("Should not release and address you don't own"); + } else { + lockDegreeMap[addr] = lockDegreeMap[addr] - 1; + if (lockDegreeMap[addr] == 0) { + lockDegreeMap.erase(addr); + lockOwnerMap.erase(addr); + return true; + } + } + return false; +} + +} diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/lock_dir.hh new file mode 100644 index 0000000000..64d934d42f --- /dev/null +++ b/src/accl/graph/sega/lock_dir.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ +#define __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ + +#include + +#include "mem/packet.hh" +#include "params/LockDirectory.hh" +#include "sim/sim_object.hh" + +namespace gem5 +{ + +class LockDirectory: public SimObject +{ + private: + std::unordered_map lockOwnerMap; + std::unordered_map lockDegreeMap; + + public: + PARAMS(LockDirectory); + LockDirectory(const LockDirectoryParams ¶ms); + + bool acquire(Addr addr, RequestorID requestorId); + bool release(Addr addr, RequestorID requestorId); +}; + +} + +#endif diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 674004d7a5..e557a08c18 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -34,7 +34,8 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), respPort(name() + ".respPort", this), - applyEngine(params.apply_engine) + applyEngine(params.apply_engine), + lockDir(params.lock_dir) {} Port& @@ -139,4 +140,16 @@ WLEngine::recvFunctional(PacketPtr pkt) // } } +bool +WLEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +WLEngine::releaseAddress(Addr 
addr) +{ + return lockDir->release(addr, requestorId); +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f895a7ad32..4e8a25795a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/base_wl_engine.hh" #include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" #include "params/WLEngine.hh" namespace gem5 @@ -64,13 +65,15 @@ class WLEngine : public BaseWLEngine RespPort respPort; ApplyEngine* applyEngine; - + LockDirectory* lockDir; virtual void startup(); void recvFunctional(PacketPtr pkt); protected: virtual bool sendWLNotif(Addr addr) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; public: PARAMS(WLEngine); From eb63831b87d00aed4447daaa7855fd5641e6de3f Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 28 Feb 2022 15:42:20 -0800 Subject: [PATCH 065/247] Debugging --- src/accl/graph/base/base_wl_engine.cc | 6 +++--- src/accl/graph/sega/wl_engine.cc | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 7f1a27aae5..f5d739da2d 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -83,8 +83,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); - DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s %d\n" + , __func__, wl.to_string(), value); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -110,7 +110,7 @@ BaseWLEngine::processNextWLReduceEvent() if (!releaseAddress(resp->getAddr())) { panic("Could not release an address"); } - std::cout << "success" << std::endl; + std::cout << "success 
"<< memRespQueue.size() << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e557a08c18..a84ed2d52f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -54,11 +54,11 @@ WLEngine::startup() //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 + {1000, 1000, 3, 0}, // Addr: 0 + {1000, 1000, 1, 3}, // Addr: 16 + {1000, 1000, 1, 4}, // Addr: 32 + {10000, 1000, 0, 5}, // Addr: 48 + {10000, 10000, 0, 5} // Addr: 64 }; Edge edges [6] = { {0, 16}, // Addr: 1048576 From 4d137d8c5389fb4dd28d4ca6a7e49df1184b9d9b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 15:53:38 -0800 Subject: [PATCH 066/247] More debugging. 
--- configs/accl/sega.py | 3 ++- src/accl/graph/base/base_engine.cc | 3 +++ src/accl/graph/sega/lock_dir.cc | 12 ++---------- src/accl/graph/sega/lock_dir.hh | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index db0bf4678f..db5a36b987 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -39,7 +39,8 @@ def __init__(self): self.clk_domain.voltage_domain = VoltageDomain() self.mpu = MPU() - self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) + self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") + # self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.port) diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 6b40ba4137..f449e6ffdb 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -87,6 +87,9 @@ BaseEngine::MemPort::recvReqRetry() bool BaseEngine::handleMemResp(PacketPtr pkt) { + if (pkt->isResponse() && pkt->isWrite()) { + return true; + } memRespQueue.push(pkt); scheduleMainEvent(); return true; diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/lock_dir.cc index b7efa638fe..6a4496175d 100644 --- a/src/accl/graph/sega/lock_dir.cc +++ b/src/accl/graph/sega/lock_dir.cc @@ -40,10 +40,6 @@ LockDirectory::acquire(Addr addr, RequestorID requestorId) { if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { lockOwnerMap[addr] = requestorId; - lockDegreeMap[addr] = 1; - return true; - } else if (lockOwnerMap[addr] == requestorId) { - lockDegreeMap[addr] = lockDegreeMap[addr] + 1; return true; } else { return false; @@ -58,12 +54,8 @@ LockDirectory::release(Addr addr, RequestorID requestorId) } else if (lockOwnerMap[addr] != requestorId) { panic("Should not release and address you don't own"); } else { - lockDegreeMap[addr] = lockDegreeMap[addr] - 1; - if 
(lockDegreeMap[addr] == 0) { - lockDegreeMap.erase(addr); - lockOwnerMap.erase(addr); - return true; - } + lockOwnerMap.erase(addr); + return true; } return false; } diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/lock_dir.hh index 64d934d42f..012334ce43 100644 --- a/src/accl/graph/sega/lock_dir.hh +++ b/src/accl/graph/sega/lock_dir.hh @@ -42,7 +42,7 @@ class LockDirectory: public SimObject { private: std::unordered_map lockOwnerMap; - std::unordered_map lockDegreeMap; + // std::unordered_map lockDegreeMap; public: PARAMS(LockDirectory); From efcbae85fd36cae6477f1aa66b802f078ef87e2f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 16:34:09 -0800 Subject: [PATCH 067/247] Fixed the bugs. Simulation is an endless loop. --- configs/accl/sega.py | 2 +- src/accl/graph/base/base_apply_engine.cc | 7 +++---- src/accl/graph/base/base_engine.cc | 6 ++++-- src/accl/graph/base/base_push_engine.cc | 2 +- src/accl/graph/base/base_wl_engine.cc | 10 ++++------ 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index db5a36b987..163ea169d9 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -50,6 +50,6 @@ def __init__(self): m5.instantiate() -exit_event = m5.simulate() +exit_event = m5.simulate(1000000) print("Simulation finished!") exit() diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 890d5dd313..e222cb5a76 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,10 +61,9 @@ BaseApplyEngine::processNextApplyCheckEvent() // pops items off queue, maybe we should pop every n cycles // or change the clock domain for this simobject. 
Addr addr = applyReadQueue.front(); - if (acquireAddress(addr)) { - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); - + Addr req_addr = (addr / 64) * 64; + Addr req_offset = (addr % 64); + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffset[memPkt->req] = req_offset; if (!memPortBlocked()) { diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index f449e6ffdb..ad87bb3662 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -27,7 +27,7 @@ */ #include "accl/graph/base/base_engine.hh" - +#include "debug/MPU.hh" namespace gem5 { @@ -36,7 +36,9 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : system(params.system), memPort(name() + ".memPort", this), requestorId(system->getRequestorId(this)) -{} +{ + DPRINTF(MPU, "%s: My requestorId is %u,\n", __func__, requestorId); +} BaseEngine::~BaseEngine() {} diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index f46941b8ed..4ebe40e486 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -121,7 +121,7 @@ BasePushEngine::processNextPushEvent() PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, requestorId); - if (sendPushUpdate(update)) { + if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); DPRINTF(MPU, "%s: Reading %s, updating with %d\n" , __func__, e.to_string(), *update_data); diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index f5d739da2d..921e9c683d 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -56,10 +56,9 @@ void BaseWLEngine::processNextWLReadEvent() uint32_t value = *(pkt->getPtr()); Addr addr = pkt->getAddr(); - if (acquireAddress(addr)) { - Addr req_addr = (addr / 64) * 64; - Addr req_offset = addr % 64; 
- + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffsetMap[memPkt->req] = req_offset; requestValueMap[memPkt->req] = value; @@ -98,7 +97,7 @@ BaseWLEngine::processNextWLReduceEvent() if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); - DPRINTF(MPU, "%s: The WLE is chanching to: %s\n" + DPRINTF(MPU, "%s: The WLE is changing to: %s\n" , __func__, wl.to_string()); // TODO: Erase map entries, delete wlData; } @@ -110,7 +109,6 @@ BaseWLEngine::processNextWLReduceEvent() if (!releaseAddress(resp->getAddr())) { panic("Could not release an address"); } - std::cout << "success "<< memRespQueue.size() << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } From f0dadbb9eea953ca1b69cca3e7bbc3dd994d87e3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 28 Feb 2022 18:34:18 -0800 Subject: [PATCH 068/247] Debugged: Releases the address when the memory is blocked. Added debugging flgs for validation. 
--- src/accl/graph/base/base_apply_engine.cc | 14 ++++++--- src/accl/graph/base/base_wl_engine.cc | 12 ++++++-- src/accl/graph/sega/wl_engine.cc | 17 ++++++----- src/mem/packet.cc | 39 ++++++++++++++++++++++++ src/mem/packet.hh | 2 ++ 5 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index e222cb5a76..39f5dafc67 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -86,8 +86,8 @@ BaseApplyEngine::processNextApplyEvent() Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); - DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item[%lu]: %s\n" + , __func__, pkt->getAddr() + request_offset, wl.to_string()); // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem // to applyengine if temp_prop < prop. If temp_prop has not changed, why // fwd it to applyengine? @@ -102,13 +102,17 @@ BaseApplyEngine::processNextApplyEvent() PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); + DPRINTF(MPU, "%s: Sending a pkt with this info. 
" + "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", + __func__, writePkt->getAddr(), + writePkt->getSize(), writePkt->printData()); + if (!memPortBlocked()) { if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); memRespQueue.pop(); - DPRINTF(MPU, "%s: The Apply Engine is applying the new value", - "into WorkList Item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: The Apply Engine is applying the new value into WorkList Item[%lu]: %s\n" + , __func__, pkt->getAddr() + request_offset, wl.to_string()); } } } else { diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 921e9c683d..fd45b85077 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -58,6 +58,7 @@ void BaseWLEngine::processNextWLReadEvent() Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; Addr req_offset = addr % 64; + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffsetMap[memPkt->req] = req_offset; @@ -67,6 +68,9 @@ void BaseWLEngine::processNextWLReadEvent() sendMemReq(memPkt); updateQueue.pop(); } + else{ + releaseAddress(req_addr); + } } if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); @@ -82,8 +86,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); - DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s %d\n" - , __func__, wl.to_string(), value); + DPRINTF(MPU, "%s: The WLE is reading WorkList item [%lu]: %s %d\n" + , __func__, resp->getAddr() + request_offset, wl.to_string(), value); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -93,6 +97,10 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); + DPRINTF(MPU, "%s: Sending a pkt with this info. 
" + "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", + __func__, writePkt->getAddr(), + writePkt->getSize(), writePkt->printData()); if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a84ed2d52f..03f74f1019 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -54,18 +54,19 @@ WLEngine::startup() //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { - {1000, 1000, 3, 0}, // Addr: 0 - {1000, 1000, 1, 3}, // Addr: 16 - {1000, 1000, 1, 4}, // Addr: 32 - {10000, 1000, 0, 5}, // Addr: 48 - {10000, 10000, 0, 5} // Addr: 64 + {10000, 10000, 3, 0}, // Addr: 0 + {10000, 10000, 1, 3}, // Addr: 16 + {10000, 10000, 1, 4}, // Addr: 32 + {10000, 10000, 1, 5}, // Addr: 48 + {10000, 10000, 0, 6} // Addr: 64 }; - Edge edges [6] = { + Edge edges [7] = { {0, 16}, // Addr: 1048576 {0, 32}, // Addr: 1048592 {0, 48}, // Addr: 1048608 {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 + {0, 64}, // Addr: 1048640 + {0, 32} }; for (int i = 0; i < 5; i++) { @@ -75,7 +76,7 @@ WLEngine::startup() sendMemFunctional(pkt); } - for (int i = 0; i < 6; i++) { + for (int i = 0; i < 7; i++) { uint8_t* data = edgeToMemory(edges[i]); PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), 16, data, 0); diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..da45246e49 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -532,4 +532,43 @@ Packet::getHtmTransactionUid() const return htmTransactionUid; } +std::string +Packet::printData() +{ + char ret[1024]; + if (isWrite()) { + uint8_t* data = getPtr(); + std::sprintf(ret,"\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + 
"V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n", + getAddr(), + *((uint32_t*) data), + *((uint32_t*) (data + 4)), + *((uint32_t*) (data + 8)), + *((uint32_t*) (data + 12)), + getAddr() + 16, + *((uint32_t*) (data + 16)), + *((uint32_t*) (data + 20)), + *((uint32_t*) (data + 24)), + *((uint32_t*) (data + 28)), + getAddr() + 32, + *((uint32_t*) (data + 32)), + *((uint32_t*) (data + 36)), + *((uint32_t*) (data + 40)), + *((uint32_t*) (data + 44)), + getAddr() + 48, + *((uint32_t*) (data + 48)), + *((uint32_t*) (data + 52)), + *((uint32_t*) (data + 56)), + *((uint32_t*) (data + 60))); + } + return ret; +} + } // namespace gem5 diff --git a/src/mem/packet.hh b/src/mem/packet.hh index a67abbbbaa..8803eacced 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1374,6 +1374,8 @@ class Packet : public Printable template void setRaw(T v); + std::string printData(); + public: /** * Check a functional request against a memory value stored in From b1a59999867d57af5d5083da4f3044ee785f6ad7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Mar 2022 01:24:54 -0700 Subject: [PATCH 069/247] Adding coalescer to the code. 
--- src/accl/graph/base/BaseReadEngine.py | 39 ++++ src/accl/graph/base/BaseReduceEngine.py | 38 ++++ src/accl/graph/base/base_read_engine.cc | 86 ++++++++ src/accl/graph/base/base_read_engine.hh | 101 ++++++++++ src/accl/graph/base/base_reduce_engine.cc | 51 +++++ src/accl/graph/base/base_reduce_engine.hh | 67 +++++++ .../graph/base/{ => old}/BaseApplyEngine.py | 0 src/accl/graph/base/{ => old}/BaseEngine.py | 0 .../graph/base/{ => old}/BasePushEngine.py | 0 src/accl/graph/base/{ => old}/BaseWLEngine.py | 0 .../graph/base/{ => old}/base_apply_engine.cc | 0 .../graph/base/{ => old}/base_apply_engine.hh | 0 src/accl/graph/base/{ => old}/base_engine.cc | 0 src/accl/graph/base/{ => old}/base_engine.hh | 0 .../graph/base/{ => old}/base_push_engine.cc | 0 .../graph/base/{ => old}/base_push_engine.hh | 0 .../graph/base/{ => old}/base_wl_engine.cc | 0 .../graph/base/{ => old}/base_wl_engine.hh | 0 src/accl/graph/sega/coalesce_engine.cc | 187 ++++++++++++++++++ src/accl/graph/sega/coalesce_engine.hh | 88 +++++++++ src/accl/graph/sega/{ => old}/ApplyEngine.py | 0 src/accl/graph/sega/{ => old}/LockDir.py | 0 src/accl/graph/sega/{ => old}/PushEngine.py | 0 src/accl/graph/sega/{ => old}/WLEngine.py | 0 src/accl/graph/sega/{ => old}/apply_engine.cc | 0 src/accl/graph/sega/{ => old}/apply_engine.hh | 0 src/accl/graph/sega/{ => old}/lock_dir.cc | 0 src/accl/graph/sega/{ => old}/lock_dir.hh | 0 src/accl/graph/sega/old/push_engine.cc | 90 +++++++++ src/accl/graph/sega/old/push_engine.hh | 77 ++++++++ src/accl/graph/sega/old/wl_engine.cc | 156 +++++++++++++++ src/accl/graph/sega/old/wl_engine.hh | 86 ++++++++ src/accl/graph/sega/push_engine.cc | 144 +++++++++++++- src/accl/graph/sega/push_engine.hh | 32 ++- src/accl/graph/sega/wl_engine.cc | 109 +++++++--- src/accl/graph/sega/wl_engine.hh | 37 ++-- 36 files changed, 1338 insertions(+), 50 deletions(-) create mode 100644 src/accl/graph/base/BaseReadEngine.py create mode 100644 src/accl/graph/base/BaseReduceEngine.py create mode 
100644 src/accl/graph/base/base_read_engine.cc create mode 100644 src/accl/graph/base/base_read_engine.hh create mode 100644 src/accl/graph/base/base_reduce_engine.cc create mode 100644 src/accl/graph/base/base_reduce_engine.hh rename src/accl/graph/base/{ => old}/BaseApplyEngine.py (100%) rename src/accl/graph/base/{ => old}/BaseEngine.py (100%) rename src/accl/graph/base/{ => old}/BasePushEngine.py (100%) rename src/accl/graph/base/{ => old}/BaseWLEngine.py (100%) rename src/accl/graph/base/{ => old}/base_apply_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_apply_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_push_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_push_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_wl_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_wl_engine.hh (100%) create mode 100644 src/accl/graph/sega/coalesce_engine.cc create mode 100644 src/accl/graph/sega/coalesce_engine.hh rename src/accl/graph/sega/{ => old}/ApplyEngine.py (100%) rename src/accl/graph/sega/{ => old}/LockDir.py (100%) rename src/accl/graph/sega/{ => old}/PushEngine.py (100%) rename src/accl/graph/sega/{ => old}/WLEngine.py (100%) rename src/accl/graph/sega/{ => old}/apply_engine.cc (100%) rename src/accl/graph/sega/{ => old}/apply_engine.hh (100%) rename src/accl/graph/sega/{ => old}/lock_dir.cc (100%) rename src/accl/graph/sega/{ => old}/lock_dir.hh (100%) create mode 100644 src/accl/graph/sega/old/push_engine.cc create mode 100644 src/accl/graph/sega/old/push_engine.hh create mode 100644 src/accl/graph/sega/old/wl_engine.cc create mode 100644 src/accl/graph/sega/old/wl_engine.hh diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py new file mode 100644 index 0000000000..84c53465b9 --- /dev/null +++ b/src/accl/graph/base/BaseReadEngine.py @@ -0,0 +1,39 @@ +# -*- 
coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from m5.params import *
from m5.proxy import *
from m5.objects.ClockedObject import ClockedObject


class BaseReadEngine(ClockedObject):
    """Abstract base SimObject for engines that read data from memory."""

    abstract = True
    type = 'BaseReadEngine'
    cxx_header = "accl/graph/base/base_read_engine.hh"
    cxx_class = 'gem5::BaseReadEngine'

    # System this engine belongs to (resolved from the parent by default).
    system = Param.System(Parent.any, 'System this Engine is a part of')
    # Request port used to issue reads to the memory system.
    mem_port = RequestPort("Port to communicate with the memory")
from m5.params import *
from m5.proxy import *
from m5.objects.ClockedObject import ClockedObject


class BaseReduceEngine(ClockedObject):
    """Abstract base SimObject for engines that reduce worklist items."""

    abstract = True
    type = 'BaseReduceEngine'
    cxx_header = "accl/graph/base/base_reduce_engine.hh"
    cxx_class = 'gem5::BaseReduceEngine'

    # System this engine belongs to (resolved from the parent by default).
    system = Param.System(Parent.any, 'System this Engine is a part of')
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/base/base_read_engine.hh" + +namespace gem5 +{ + +BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): + ClockedObject(params), + system(params.system), + memPort(name() + ".mem_port", this), + _requestorId(system.getRequestorId(this)), +{} + +BaseReadEngine::~BaseReadEngine() +{} + +Port& +BaseReadEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +BaseReadEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + return owner->handleMemResp(pkt); +} + +void +BaseReadEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh new file mode 100644 index 0000000000..99f14bcb06 --- /dev/null +++ b/src/accl/graph/base/base_read_engine.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_READ_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_READ_ENGINE_HH__ + +#include +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReadEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseReadEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + MemPort memPort; + + bool handleMemResp(PacketPtr resp); + + protected: + const RequestorID _requestorId; + + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + virtual bool handleMemResp(PacketPtr pkt) = 0; + + public: + PARAMS(BaseReadEngine); + + BaseReadEngine(const BaseReadEngineParams ¶ms); + ~BaseReadEngine(); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + RequestorID requestorId() { return _requestorId; } + + 
AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } + + void recvFunctional(PacketPtr pkt); + +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..fbfc613313 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +void +BaseReduceEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + currentWorkListAddress = addr; + currentWorkList = wl; + scheduleReduceEvent(); +} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..e44f384f26 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ + + +#include "accl/base/util.hh" +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + bool handleIncomingWL(Addr addr, WorkListItem wl); + + protected: + Addr currentWorkListAddress; + WorkListItem currentWorkList; + + const RequestorID _requestorId; + + virtual void scheduleReduceEvent() = 0; + + public: + PARAMS(BaseReduceEngine); + + BaseReduceEngine(const BaseReduceEngineParams ¶ms); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/old/BaseApplyEngine.py similarity index 100% rename from src/accl/graph/base/BaseApplyEngine.py rename to src/accl/graph/base/old/BaseApplyEngine.py diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/old/BaseEngine.py similarity index 100% rename from src/accl/graph/base/BaseEngine.py rename to src/accl/graph/base/old/BaseEngine.py diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/old/BasePushEngine.py similarity index 100% rename from src/accl/graph/base/BasePushEngine.py rename to src/accl/graph/base/old/BasePushEngine.py diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/old/BaseWLEngine.py similarity index 100% rename from src/accl/graph/base/BaseWLEngine.py rename to src/accl/graph/base/old/BaseWLEngine.py diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/old/base_apply_engine.cc similarity index 100% rename from src/accl/graph/base/base_apply_engine.cc rename to src/accl/graph/base/old/base_apply_engine.cc diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/old/base_apply_engine.hh similarity index 100% 
rename from src/accl/graph/base/base_apply_engine.hh rename to src/accl/graph/base/old/base_apply_engine.hh diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/old/base_engine.cc similarity index 100% rename from src/accl/graph/base/base_engine.cc rename to src/accl/graph/base/old/base_engine.cc diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/old/base_engine.hh similarity index 100% rename from src/accl/graph/base/base_engine.hh rename to src/accl/graph/base/old/base_engine.hh diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/old/base_push_engine.cc similarity index 100% rename from src/accl/graph/base/base_push_engine.cc rename to src/accl/graph/base/old/base_push_engine.cc diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/old/base_push_engine.hh similarity index 100% rename from src/accl/graph/base/base_push_engine.hh rename to src/accl/graph/base/old/base_push_engine.hh diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/old/base_wl_engine.cc similarity index 100% rename from src/accl/graph/base/base_wl_engine.cc rename to src/accl/graph/base/old/base_wl_engine.cc diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/old/base_wl_engine.hh similarity index 100% rename from src/accl/graph/base/base_wl_engine.hh rename to src/accl/graph/base/old/base_wl_engine.hh diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc new file mode 100644 index 0000000000..1f7a94dc7e --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/sega/coalesce_engine.hh" + +#include "accl/sega/wl_engine.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): + BaseReadEngine(params), + reqQueueSize(params.req_queue_size), + conflictAddrQueueSize(params.conflict_addr_queue_size), + nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), + nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) +{} + +CoalesceEngine::~CoalesceEngine() +{} + +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + sendMemFunctional(pkt); +} + +void +CoalesceEngine::registerWLEngine(WLEngine* wl_engine) +{ + peerWLEngine = wl_engine; +} + +bool +CoalesceEngine::recvReadAddr(Addr addr) +{ + assert(reqQueue.size() <= reqQueueSize); + if (reqQueue.size() == reqQueueSize) { + return false; + } + + reqQueue.push(addr); + if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextRespondEvent() +{ + // TODO: Investigate this for optimization + Addr addr = reqQueue.front(); + Addr alligned_addr = (addr / 64) * 64; + int block_index = alligned_addr % 256; + int wl_offset = (addr - alligned_addr) / 16; + + if (cacheBlocks[block_index].allocated) { + // Hit + // TODO: I guess this piece of code code could be optimized. + // Not the code per se. The design it represents. 
+ if (cacheBlocks[block_index].addr == alligned_addr) { + if (!cacheBlocks[block_index].taken[wl_offset]) { + if (cacheBlocks[block_index].valid) { + peerWLEngine->handleIncomingWL(addr, + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].taken[wl_offset] = true; + } else { + cacheBlocks[block_index].pending[wl_offset] = true; + } + reqQueue.pop(); + } + } else { // conflict + assert(conflictAddrQueue.size() <= conflictAddrQueueSize); + if (conflictAddrQueue.size() < conflictAddrQueueSize) { + cacheBlocks[block_index].numConflicts += 1; + conflictAddrQueue.push(addr); + reqQueue.pop(); + } + } + } else { + // miss + cacheBlocks[block_index].addr = alligned_addr; + cacheBlocks[block_index].numConflicts = 0; + cacheBlocks[block_index].pending = {false, false, false, false}; + cacheBlocks[block_index].pending[wl_offset] = true; + cacheBlocks[block_index].taken = {false, false, false, false}; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].allocated = true; + + PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); + + if (!memPortBlocked()) { + sendMemReq(pkt); + reqQueue.pop(); + } + } + + if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); + } +} + +/* + void recvWLWrite(Addr addr, WorkListItem wl); +*/ + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + if (pkt->isResp() && pkt->isWrite()) { + return true; + } + + Addr addr = pkt->getAddr(); + uint8_t data = pkt->getPtr(); + + int block_index = addr % 256; + cacheBlocks[block_index].valid = true; + + for (i = 0; i < 4; i++) { + cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); + cacheBlocks[block_index].taken[i] = false; + if (cacheBlocks[block_index].pending[i]) { + peerWLEngine->handleIncomingWL(addr + (i * 16), + cacheBlocks[block_index].items[i]); + cacheBlocks[block_index].taken[i] = true; + } + cacheBlocks[block_index].pending = false; + } +} + +void 
+CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr alligned_addr = (addr / 64) * 64; + int block_index = alligned_addr % 256; + int wl_offset = (addr - alligned_addr) / 16; + + assert(cacheBlocks[block_index].taken[wl_offset]); + cacheBlocks[block_index].item[wl_offset] = wl; + cacheBlocks[block_index].taken[wl_offset] = false; + + bool taken_item = false; + taken_item &= (cacheBlocks[block_index].taken[0] & + cacheBlocks[block_index].taken[1] & + cacheBlocks[block_index].taken[2] & + cacheBlocks[block_index].taken[3]); + + if (!taken_item) { + for (auto conflictAddr : conflictAddrQueue) { + int conflict_block_index = ((conflictAddr / 64) * 64) % 256; + if (conflict_block_index == block_index) { + // Evict cacheBlocks[block_index] + // Respond to conflictAddr + } + } + } + +} + +} diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh new file mode 100644 index 0000000000..0b349b2c1a --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include "accl/base/base_read_engine.hh" + +namespace gem5 +{ + +class WLEngine; + +class CoalesceEngine : public BaseReadEngine +{ + private: + struct Block + { + WorkListItem items[4]; + Addr addr; + int numConflicts; + bool pending[4]; + bool taken[4]; + bool valid; + bool allocated; + }; + + WLEngine* peerWLEngine; + + Block cacheBlocks[256]; + + int reqQueueSize; + std::queue reqQueue; + + int conflictAddrQueueSize; + std::queue conflictAddrQueue; + + EventFunctionWrapper nextRespondEvent; + void processNextRespondEvent(); + + EventFunctionWrapper nextApplyAndCommitEvent; + void processNextApplyAndCommitEvent(); + + protected: + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(CoalesceEngine); + + CoalesceEngine(const CoalesceEngineParams ¶ms); + ~CoalesceEngine(); + + void recvFunctional(PacketPtr pkt); + + bool recvReadAddr(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + void registerWLEngine(WLEngine* wl_engine); +} + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/ApplyEngine.py 
b/src/accl/graph/sega/old/ApplyEngine.py similarity index 100% rename from src/accl/graph/sega/ApplyEngine.py rename to src/accl/graph/sega/old/ApplyEngine.py diff --git a/src/accl/graph/sega/LockDir.py b/src/accl/graph/sega/old/LockDir.py similarity index 100% rename from src/accl/graph/sega/LockDir.py rename to src/accl/graph/sega/old/LockDir.py diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/old/PushEngine.py similarity index 100% rename from src/accl/graph/sega/PushEngine.py rename to src/accl/graph/sega/old/PushEngine.py diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/old/WLEngine.py similarity index 100% rename from src/accl/graph/sega/WLEngine.py rename to src/accl/graph/sega/old/WLEngine.py diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/old/apply_engine.cc similarity index 100% rename from src/accl/graph/sega/apply_engine.cc rename to src/accl/graph/sega/old/apply_engine.cc diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/old/apply_engine.hh similarity index 100% rename from src/accl/graph/sega/apply_engine.hh rename to src/accl/graph/sega/old/apply_engine.hh diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/old/lock_dir.cc similarity index 100% rename from src/accl/graph/sega/lock_dir.cc rename to src/accl/graph/sega/old/lock_dir.cc diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/old/lock_dir.hh similarity index 100% rename from src/accl/graph/sega/lock_dir.hh rename to src/accl/graph/sega/old/lock_dir.hh diff --git a/src/accl/graph/sega/old/push_engine.cc b/src/accl/graph/sega/old/push_engine.cc new file mode 100644 index 0000000000..c7b229ad33 --- /dev/null +++ b/src/accl/graph/sega/old/push_engine.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const PushEngineParams ¶ms) : + BasePushEngine(params), + reqPort(name() + "reqPort", this) +{} + +Port& +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return BasePushEngine::getPort(if_name, idx); + } +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; +} + +} diff --git a/src/accl/graph/sega/old/push_engine.hh b/src/accl/graph/sega/old/push_engine.hh new file mode 100644 index 0000000000..604df4750d --- /dev/null +++ b/src/accl/graph/sega/old/push_engine.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include "accl/graph/base/base_push_engine.hh" +#include "params/PushEngine.hh" + +namespace gem5 +{ + +class MPU; + +class PushEngine : public BasePushEngine +{ + private: + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, PushEngine* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; + + protected: + virtual bool sendPushUpdate(PacketPtr pkt) override; + + public: + PARAMS(PushEngine); + PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/wl_engine.cc b/src/accl/graph/sega/old/wl_engine.cc new file mode 100644 index 0000000000..03f74f1019 --- /dev/null +++ b/src/accl/graph/sega/old/wl_engine.cc @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" +#include "debug/MPU.hh" +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams ¶ms): + BaseWLEngine(params), + respPort(name() + ".respPort", this), + applyEngine(params.apply_engine), + lockDir(params.lock_dir) +{} + +Port& +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "resp_port") { + return respPort; + } else { + return BaseWLEngine::getPort(if_name, idx); + } +} + +void +WLEngine::startup() +{ + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {10000, 10000, 3, 0}, // Addr: 0 + {10000, 10000, 1, 3}, // Addr: 16 + {10000, 10000, 1, 4}, // Addr: 32 + {10000, 10000, 1, 5}, // Addr: 48 + {10000, 10000, 0, 6} // Addr: 64 + }; + Edge edges [7] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64}, // Addr: 1048640 + {0, 32} + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + sendMemFunctional(pkt); + } + + for (int i = 0; i < 7; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + sendMemFunctional(pkt); + } + + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = getUpdatePacket( + 0, 4, first_update_data, requestorId); + + handleWLUpdate(first_update); +} + +bool +WLEngine::sendWLNotif(Addr addr){ + return applyEngine->recvWLNotif(addr); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleWLUpdate(pkt); +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void 
+WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + // FIXME: This needs to be fixed + // if (pkt->cmd == MemCmd::UpdateWL) { + // panic("Functional requests should not be made to WL."); + // //TODO: Might be a good idea to implement later. + // // wlEngine->recvFunctional(pkt); + // } else { + sendMemFunctional(pkt); + // } +} + +bool +WLEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +WLEngine::releaseAddress(Addr addr) +{ + return lockDir->release(addr, requestorId); +} + +} diff --git a/src/accl/graph/sega/old/wl_engine.hh b/src/accl/graph/sega/old/wl_engine.hh new file mode 100644 index 0000000000..4e8a25795a --- /dev/null +++ b/src/accl/graph/sega/old/wl_engine.hh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_wl_engine.hh" +#include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" +#include "params/WLEngine.hh" + +namespace gem5 +{ + +class ApplyEngine; + +class WLEngine : public BaseWLEngine +{ + private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + + public: + RespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + RespPort respPort; + ApplyEngine* applyEngine; + LockDirectory* lockDir; + + virtual void startup(); + void recvFunctional(PacketPtr pkt); + + protected: + virtual bool sendWLNotif(Addr addr) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; + + public: + PARAMS(WLEngine); + WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c7b229ad33..c865451999 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -31,9 +31,16 @@ namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms) : - BasePushEngine(params), - reqPort(name() + "reqPort", this) +PushEngine::PushEngine(const PushEngineParams ¶ms): + BaseReadEngine(params), + reqPort(name() + ".req_port", this), + baseEdgeAddr(params.base_edge_addr), + memRespQueueSize(params.mem_resp_queue_size), + pushReqQueueSize(params.push_req_queue_size), + onTheFlyReadReqs(0), + nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextPushEvent([this] { processNextPushEvent(); }, name()) {} Port& @@ -41,8 +48,10 @@ PushEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "req_port") { return reqPort; + } else if (if_name == "mem_port") { + return BaseReadEngine::getPort(if_name, idx); } else { - return BasePushEngine::getPort(if_name, idx); + return SimObject::getPort(if_name, idx); } } @@ -78,13 +87,130 @@ PushEngine::ReqPort::recvReqRetry() } bool -PushEngine::sendPushUpdate(PacketPtr pkt) +PushEngine::recvWLItem(WorkListItem wl); { - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; + assert(pushReqQueue.size() <= pushReqQueueSize); + if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { + return false; + } + pushReqQueue.push(wl); + + if ((!nextAddrGenEvent.scheduled()) && + (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); + } + return true; +} + +void +PushEngine::processNextAddrGenEvent() +{ + WorkListItem wl = pushReqQueue.front(); + + std::vector addr_queue; + std::vector offset_queue; + std::vector num_edge_queue; + + for (uint32_t index = 0; index < wl.degree; index++) { + Addr edge_addr = baseEdgeAddr + (wl.edgeIndex + index) * sizeof(Edge); + Addr req_addr = (edge_addr / 64) * 64; + Addr req_offset = edge_addr % 64; + if (addr_queue.size()) { + if (addr_queue.back() == 
req_addr) { + num_edge_queue.back()++; + } + else { + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); + } + } + else { + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); + } + }; + + for (int index = 0; index < addr_queue.size(); index++) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = wl.prop; + pendingReadReqs.push(pkt); + } + + pushReadReqs.pop(); + + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); + } + + if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +PushEngine::processNextReadEvent() +{ + if (((memRespQueue.size() + onTheFlyReadReqs) <= memRespQueueSize) && + (!memPortBlocked())) { + PacketPtr pkt = pendingReadReqs.front(); + sendMemReq(pkt); + onTheFlyReadReqs++; + pendingReadReqs.pop(); + } + + if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { + schedule(nextReadEvent, nextCycle()); + } +} + +bool +PushEngine::handleMemResp(PacketPtr pkt) +{ + onTheFlyReadReqs--; + memRespQueue.push(pkt); + + if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { + schedule(nextPushEvent, nextCycle()); + } +} + +void +PushEngine::processNextPushEvent() +{ + PacketPtr pkt = memRespQueue.front(); + RequestPtr req = pkt->req; + uint8_t *data = pkt->getPtr(); + + Addr offset = reqOffsetMap[req]; + int num_edges = reqNumEdgeMap[req]; + uint32_t value = reqValueMap[req]; + + int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); + for (int i = 0; i < num_edges; i++) { + uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); + Edge e = memoryToEdge(curr_edge_data); + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t 
[data_size]); + // TODO: Implement propagate function here + *update_data = value + 1; + PacketPtr update = getUpdatePacket(e.neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, + requestorId); + if (sendPushUpdate(update) && (i == num_edges - 1)) { + memRespQueue.pop(); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); + // TODO: Erase map entries here. + } + } + + if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextPushEvent, nextCycle()); } - return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 604df4750d..bf645eb119 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,15 +29,13 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_push_engine.hh" +#include "accl/graph/base/base_read_engine.hh" #include "params/PushEngine.hh" namespace gem5 { -class MPU; - -class PushEngine : public BasePushEngine +class PushEngine : public BaseReadEngine { private: class ReqPort : public RequestPort @@ -62,14 +60,38 @@ class PushEngine : public BasePushEngine ReqPort reqPort; + Addr baseEdgeAddr; + + int pushReqQueueSize; + std::queue pushReqQueue; + + // TODO: Possibility of infinite queueing + std::queue pendingReadReqs; + + int memRespQueueSize; + int onTheFlyReadReqs; + std::queue memRespQueue; + + EventFunctionWrapper nextAddrGenEvent; + void processNextAddrGenEvent(); + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextPushEvent; + void processNextPushEvent(); + protected: - virtual bool sendPushUpdate(PacketPtr pkt) override; + virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool recvWLItem(WorkListItem wl); }; } diff --git 
a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 03f74f1019..f0c522ff6f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,15 +28,22 @@ #include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" + namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params), - respPort(name() + ".respPort", this), - applyEngine(params.apply_engine), - lockDir(params.lock_dir) -{} + BaseReduceEngine(params), + respPort(name() + ".resp_port", this), + blockedByCoalescer(false), + coaleseEngine(params.coalesce_engine), + updateQueueSize(params.update_queue_size), + onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextReduceEvent([this]{ processNextReduceEvent(); }, name()) +{ + coaleseEngine->registerWLEngine(this); +} Port& WLEngine::getPort(const std::string &if_name, PortID idx) @@ -44,7 +51,7 @@ WLEngine::getPort(const std::string &if_name, PortID idx) if (if_name == "resp_port") { return respPort; } else { - return BaseWLEngine::getPort(if_name, idx); + return BaseReduceEngine::getPort(if_name, idx); } } @@ -53,6 +60,8 @@ WLEngine::startup() { //FIXME: This is the current version of our initializer. // This should be updated in the future. + //FIXME: The WLEngine no longer has a MemPort. Update this to + // work with the CoalesceEngine instead. 
WorkListItem vertices [5] = { {10000, 10000, 3, 0}, // Addr: 0 {10000, 10000, 1, 3}, // Addr: 16 @@ -93,11 +102,6 @@ WLEngine::startup() handleWLUpdate(first_update); } -bool -WLEngine::sendWLNotif(Addr addr){ - return applyEngine->recvWLNotif(addr); -} - AddrRangeList WLEngine::RespPort::getAddrRanges() const { @@ -107,7 +111,7 @@ WLEngine::RespPort::getAddrRanges() const bool WLEngine::RespPort::recvTimingReq(PacketPtr pkt) { - return owner->handleWLUpdate(pkt); + return owner->handleIncomingUpdate(pkt); } Tick @@ -131,26 +135,81 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - // FIXME: This needs to be fixed - // if (pkt->cmd == MemCmd::UpdateWL) { - // panic("Functional requests should not be made to WL."); - // //TODO: Might be a good idea to implement later. - // // wlEngine->recvFunctional(pkt); - // } else { - sendMemFunctional(pkt); - // } + coaleseEngine->recvFunctional(pkt); } -bool -WLEngine::acquireAddress(Addr addr) +AddrRangeList +WLEngine::getAddrRanges() { - return lockDir->acquire(addr, requestorId); + return coaleseEngine->getAddrRanges(); +} + +void +WLEngine::processNextReadEvent() +{ + PacketPtr update = updateQueue.front(); + Addr update_addr = update->getAddr(); + uint32_t update_value = update->getPtr(); + + if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && + (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { + if (coalesceEngine->recvReadAddr(update_addr)) { + onTheFlyUpdateMap[update_addr] = update_value + updateQueue.pop(); + } + } else { + // TODO: Generalize this to reduce function rather than just min + onTheFlyUpdateMap[update_addr] = + min(update_addr, onTheFlyUpdateMap[update_addr]); + updateQueue.pop(); + // TODO: Add a stat to count the number of coalescions + } + + if ((!nextReadEvent.scheduled()) && + ((!updateQueue.empty()) || + (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize))) { + schedule(nextReadEvent, nextCycle()); + } +} + +void 
+WLEngine::processNextReduceEvent() +{ + // TODO: Generalize this to reduce function rather than just min + currentWorkList.temp_prop = min(onTheFlyUpdateMap[currentWorkListAddress], + currentWorkList.temp_prop); + // TODO: Add a delay here + coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); + + onTheFlyUpdateMap.erase(currentWorkListAddress); + currentWorkListAddress = 0; + currentWorkList = {0, 0, 0, 0}; +} + +void +WLEngine::scheduleReduceEvent() +{ + // TODO: Add checks to see if scheduling is necessary or correct. + if (!nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } } bool -WLEngine::releaseAddress(Addr addr) +WLEngine::handleIncomingUpdate(PacketPtr pkt) { - return lockDir->release(addr, requestorId); + // TODO: Coalesce updates here too + assert(updateQueue.size() <= updateQueueSize); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } + + updateQueue.push(pkt); + if ((!nextReadEvent.scheduled()) && + (!updateQueue.empty())) { + schedule(nextReadEvent, nextCycle()); + } + return true; } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 4e8a25795a..1846825951 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -32,17 +32,14 @@ #include #include -#include "accl/graph/base/base_wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" +#include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" #include "params/WLEngine.hh" namespace gem5 { -class ApplyEngine; - -class WLEngine : public BaseWLEngine +class WLEngine : public BaseReduceEngine { private: class RespPort : public ResponsePort @@ -64,22 +61,40 @@ class WLEngine : public BaseWLEngine }; RespPort respPort; - ApplyEngine* applyEngine; - LockDirectory* lockDir; + + bool blockedByCoalescer; + CoalesceEngine* coaleseEngine; + + int updateQueueSize; + std::queue 
updateQueue; + + int onTheFlyUpdateMapSize; + std::unordered_map onTheFlyUpdateMap; virtual void startup(); + void recvFunctional(PacketPtr pkt); + AddrRangeList getAddrRanges() const; + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextReduceEvent; + void processNextReduceEvent(); + protected: - virtual bool sendWLNotif(Addr addr) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; + virtual void scheduleReduceEvent() = 0; public: PARAMS(WLEngine); + WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool handleIncomingUpdate(PacketPtr pkt); }; } From 4cc59dc9487d376ee1185cabad60a7ead7b1b564 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 22 Mar 2022 16:01:55 -0700 Subject: [PATCH 070/247] Finalizing source code. Before compile. --- src/accl/graph/base/SConscript | 12 +- src/accl/graph/sega/CoalesceEngine.py | 40 ++++ src/accl/graph/sega/PushEngine.py | 40 ++++ src/accl/graph/sega/SConscript | 8 +- src/accl/graph/sega/WLEngine.py | 40 ++++ src/accl/graph/sega/coalesce_engine.cc | 306 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 30 ++- 7 files changed, 377 insertions(+), 99 deletions(-) create mode 100644 src/accl/graph/sega/CoalesceEngine.py create mode 100644 src/accl/graph/sega/PushEngine.py create mode 100644 src/accl/graph/sega/WLEngine.py diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index c5c8c4e901..c6a78eb5e8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,15 +27,11 @@ Import('*') -SimObject('BaseApplyEngine.py') -SimObject('BaseEngine.py') -SimObject('BasePushEngine.py') -SimObject('BaseWLEngine.py') +SimObject('BaseReadEngine.py') +SimObject('BaseReduceEngine.py') -Source('base_apply_engine.cc') -Source('base_engine.cc') -Source('base_push_engine.cc') -Source('base_wl_engine.cc') 
+Source('base_read_engine.cc') +Source('base_reduce_engine.cc') Source('util.cc') DebugFlag('MPU') diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py new file mode 100644 index 0000000000..0330da7576 --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReadEngine import BaseReadEngine + +class CoalesceEngine(BaseReadEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + peer_push_engine = Param.PushEngine(NULL, "") + num_mshr_entry = Param.Int(4, "") + num_tgts_per_mshr = Param.Int(20, "") + outstanding_mem_req_queue_size = Param.Int(20, "") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..9036b4e401 --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReadEngine import BaseReadEngine + +class PushEngine(BaseReadEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + req_port = RequestPort("Port to send updates to the outside") + base_edge_addr = Param.Addr() + mem_resp_queue_size = Param.Int(0, "") + push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index e6d2f1fbbc..9b4629838b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,12 +27,12 @@ Import('*') -SimObject('ApplyEngine.py') -SimObject('LockDir.py') +SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') -Source('apply_engine.cc') -Source('lock_dir.cc') +Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') + +DebugFlag('MPU') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..ec9154b138 --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReduceEngine import BaseReduceEngine + +class WLEngine(BaseReduceEngine): + type = 'WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + resp_port = ResponsePort("Port to Receive updates from outside") + coalesce_engine = Param.CoaleseEngine(NULL, "") + update_queue_size = Param.Int(0, "") + on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1f7a94dc7e..22bc0d49a6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,14 +29,17 @@ #include "accl/sega/coalesce_engine.hh" #include "accl/sega/wl_engine.hh" +#include "debug/MPU.hh" namespace gem5 { CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): BaseReadEngine(params), - reqQueueSize(params.req_queue_size), - conflictAddrQueueSize(params.conflict_addr_queue_size), + peerPushEngine(params.peer_push_engine), + numMSHREntry(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} @@ -59,69 +62,100 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) bool CoalesceEngine::recvReadAddr(Addr addr) { - assert(reqQueue.size() <= reqQueueSize); - if (reqQueue.size() == reqQueueSize) { - return false; - } - - reqQueue.push(addr); - if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); - } - return true; -} - -void -CoalesceEngine::processNextRespondEvent() -{ - // TODO: Investigate this for optimization - Addr addr = reqQueue.front(); + assert(MSHRMap.size() <= numMSHREntry); Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int 
wl_offset = (addr - alligned_addr) / 16; - if (cacheBlocks[block_index].allocated) { + if ((cacheBlocks[block_index].addr == alligned_addr) && + (cacheBlocks[block_index].valid)) { // Hit - // TODO: I guess this piece of code code could be optimized. - // Not the code per se. The design it represents. - if (cacheBlocks[block_index].addr == alligned_addr) { - if (!cacheBlocks[block_index].taken[wl_offset]) { - if (cacheBlocks[block_index].valid) { - peerWLEngine->handleIncomingWL(addr, - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].taken[wl_offset] = true; - } else { - cacheBlocks[block_index].pending[wl_offset] = true; - } - reqQueue.pop(); - } - } else { // conflict - assert(conflictAddrQueue.size() <= conflictAddrQueueSize); - if (conflictAddrQueue.size() < conflictAddrQueueSize) { - cacheBlocks[block_index].numConflicts += 1; - conflictAddrQueue.push(addr); - reqQueue.pop(); - } + addrResponseQueue.push(addr); + worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); } + return true; } else { // miss - cacheBlocks[block_index].addr = alligned_addr; - cacheBlocks[block_index].numConflicts = 0; - cacheBlocks[block_index].pending = {false, false, false, false}; - cacheBlocks[block_index].pending[wl_offset] = true; - cacheBlocks[block_index].taken = {false, false, false, false}; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].allocated = true; - - PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); - - if (!memPortBlocked()) { - sendMemReq(pkt); - reqQueue.pop(); + if (MSHRMap.find(block_index) == MSHRMap.end()) { + if (MSHRMap.size() == numMSHREntry) { + // Out of MSHR entries + return false; + } else { + if (cacheBlock[block_index].allocated) { + assert(MSHRMap[block_index].size() 
<= numTgtsPerMSHR) + if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + return false; + } + // MSHR available but conflict + cacheBlocks[block_index].hasConflict = true; + MSHRMap[block_index].push_back(addr); + return true; + } else { + // MSHR available and no conflict + assert( + outstandingMemReqQueue.size() <= + outstandingMemReqQueueSize); + if (outstandingMemReqQueue.size() == + outstandingMemReqQueueSize) { + return false; + } + cacheBlocks[block_index].addr = alligned_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + + MSHRMap[block_index].push_back(addr); + PacketPtr pkt = getReadPacket(alligned_addr, + 64, _requestorId); + outstandingMemReqQueue.push(pkt); + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } + return true; + } + } } + } +} + +void +CoalesceEngine::processNextMemReqEvent() +{ + PacketPtr pkt = outstandingMemReqQueue.front(); + + if (!memPortBlocked()) { + sendMemReq(pkt); + outstandingMemReqQueue.pop(); + } + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); } +} + +void +CoalesceEngine::processNextRespondEvent() +{ + Addr addr_response = addrResponseQueue.front(); + WorkListItem worklist_response = worklistResponseQueue.front(); + + peerWLEngine->handleIncomingWL(addr_response, worklist_response); - if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + addrResponseQueue.pop(); + worklistResponseQueue.pop(); + + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } @@ -139,19 +173,50 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); uint8_t data = pkt->getPtr(); - int block_index = addr % 256; + + 
assert((cacheBlocks[block_index].allocated) && // allocated cache block + (!cacheBlocks[block_index].valid) && // valid is false + (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR cacheBlocks[block_index].valid = true; - for (i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); - cacheBlocks[block_index].taken[i] = false; - if (cacheBlocks[block_index].pending[i]) { - peerWLEngine->handleIncomingWL(addr + (i * 16), - cacheBlocks[block_index].items[i]); - cacheBlocks[block_index].taken[i] = true; + } + + int bias = 0; + std::vector servicedIndices; + for (int i = 0; i < MSHRMap[block_index].size(); i++) { + Addr miss_addr = MSHRMap[block_index][i]; + Addr alligned_miss_addr = (miss_addr / 64) * 64; + + if (alligned_miss_addr == addr) { + int wl_offset = (miss_addr - alligned_miss_addr) / 16; + addrResponseQueue.push(miss_addr); + worklistResponseQueue.push( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); + servicedIndices.push_back(i); } - cacheBlocks[block_index].pending = false; + } + // TODO: We Can use taken instead of this + for (int i = 0; i < servicedIndices.size(); i++) { + MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + servicedIndices[i] - bias); + bias++; + } + + if (MSHRMap[block_index].empty()) { + MSHRMap.erase(block_index); + cacheBlocks[block_index].hasConflict = false; + } else { + cacheBlocks[block_index].hasConflict = true; + } + + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); } } @@ -162,26 +227,111 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; - assert(cacheBlocks[block_index].taken[wl_offset]); + assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + (1 << wl_offset)); 
cacheBlocks[block_index].item[wl_offset] = wl; - cacheBlocks[block_index].taken[wl_offset] = false; - - bool taken_item = false; - taken_item &= (cacheBlocks[block_index].taken[0] & - cacheBlocks[block_index].taken[1] & - cacheBlocks[block_index].taken[2] & - cacheBlocks[block_index].taken[3]); - - if (!taken_item) { - for (auto conflictAddr : conflictAddrQueue) { - int conflict_block_index = ((conflictAddr / 64) * 64) % 256; - if (conflict_block_index == block_index) { - // Evict cacheBlocks[block_index] - // Respond to conflictAddr - } + cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + + // TODO: Make this more general and programmable. + // && (cacheBlocks[block_index].hasConflict) + if ((cacheBlocks[block_index].takenMask == 0)) { + evictQueue.push(block_index); + } + + if ((!nextApplyAndCommitEvent.scheduled()) && + (!evictQueue.empty())) { + schedule(nextApplyAndCommitEvent, nextCycle()); + } + +} + +void +CoalesceEngine::processNextApplyAndCommitEvent() +{ + int block_index = evictQueue.front(); + uint8_t changedMask = 0; + uint8_t data[64]; + + for (int i = 0; i < 4; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].temp_prop); + if (old_prop != cacheBlocks[block_index].items[i].prop) { + changedMask |= (1 << i); } + uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); + std::memcpy(data[i * 16], wl_data, sizeof(WorkListItem)); } + if (changed) { + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + PacketPtr write_pkt = getWritePacket( + cacheBlocks[block_index].addr, 64, data, _requestorId); + + if ((cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ + Addr miss_addr = MSHRMap[block_index][0]; + // TODO: Make sure this trick works; + Addr alligned_miss_addr = (miss_addr / 64) * 64; + PacketPtr read_pkt 
= getReadPacket( + alligned_miss_addr, 64, _requestorId); + outstandingMemReqQueue.push(write_pkt); + outstandingMemReqQueue.push(read_pkt); + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + } + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + evictQueue.pop(); + } else if ((!cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + outstandingMemReqQueue.push(write_pkt); + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + } + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + evictQueue.pop(); + } else { + DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , + __func__); + } + } + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } + + if ((!nextApplyAndCommitEvent.scheduled()) && + (!evictQueue.empty())) { + schedule(nextApplyAndCommitEvent, nextCycle()); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh 
b/src/accl/graph/sega/coalesce_engine.hh index 0b349b2c1a..f5fd85e4cf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #include "accl/base/base_read_engine.hh" +#include "accl/sega/push_engine.hh" namespace gem5 { @@ -43,22 +44,33 @@ class CoalesceEngine : public BaseReadEngine { WorkListItem items[4]; Addr addr; - int numConflicts; - bool pending[4]; - bool taken[4]; - bool valid; + uint8_t takenMask; bool allocated; + bool valid; + bool hasConflict; + // TODO: This might be useful in the future + // Tick lastWLWriteTick; }; WLEngine* peerWLEngine; - + PushEngine* peerPushEngine; + Block cacheBlocks[256]; - int reqQueueSize; - std::queue reqQueue; + int numMSHREntry; + int numTgtsPerMSHR; + std::unordered_map> MSHRMap; + + int outstandingMemReqQueueSize; + std::queue outstandingMemReqQueue; + + std::queue addrResponseQueue; + std::queue worklistResponseQueue; + + std::queue evictQueue; - int conflictAddrQueueSize; - std::queue conflictAddrQueue; + EventFunctionWrapper nextMemReqEvent; + void processNextMemReqEvent(); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); From 965a48e61fc7868cf4dfaa190ca99618f0c51d07 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 22 Mar 2022 17:31:55 -0700 Subject: [PATCH 071/247] Compiles. 
--- src/accl/graph/base/SConscript | 2 -- src/accl/graph/base/base_read_engine.cc | 4 +-- src/accl/graph/base/base_read_engine.hh | 11 ++++---- src/accl/graph/base/base_reduce_engine.cc | 2 +- src/accl/graph/base/base_reduce_engine.hh | 9 ++++--- src/accl/graph/base/util.hh | 5 ++++ src/accl/graph/sega/PushEngine.py | 2 +- src/accl/graph/sega/WLEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 31 ++++++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 10 +++++--- src/accl/graph/sega/push_engine.cc | 24 ++++++++++++++---- src/accl/graph/sega/push_engine.hh | 7 +++++ src/accl/graph/sega/wl_engine.cc | 29 +++++++++++---------- src/accl/graph/sega/wl_engine.hh | 4 +-- 14 files changed, 88 insertions(+), 54 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index c6a78eb5e8..8aefca2185 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -33,5 +33,3 @@ SimObject('BaseReduceEngine.py') Source('base_read_engine.cc') Source('base_reduce_engine.cc') Source('util.cc') - -DebugFlag('MPU') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 4192cdb565..894831429b 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/base/base_read_engine.hh" +#include "accl/graph/base/base_read_engine.hh" namespace gem5 { @@ -35,7 +35,7 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - _requestorId(system.getRequestorId(this)), + _requestorId(system->getRequestorId(this)) {} BaseReadEngine::~BaseReadEngine() diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 99f14bcb06..956c50e47d 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_GRAPH_BASE_READ_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_READ_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ #include #include @@ -35,7 +35,7 @@ #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/BaseEngine.hh" +#include "params/BaseReadEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -53,7 +53,7 @@ class BaseReadEngine : public ClockedObject PacketPtr blockedPacket; public: - MemPort(const std::string& name, BaseEngine* owner): + MemPort(const std::string& name, BaseReadEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -69,8 +69,6 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; - bool handleMemResp(PacketPtr resp); - protected: const RequestorID _requestorId; @@ -85,6 +83,7 @@ class BaseReadEngine : public ClockedObject BaseReadEngine(const BaseReadEngineParams ¶ms); ~BaseReadEngine(); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index fbfc613313..82643ba3ff 100644 --- 
a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/base/base_reduce_engine.hh" +#include "accl/graph/base/base_reduce_engine.hh" namespace gem5 { diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index e44f384f26..7851eaf585 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -26,11 +26,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ -#include "accl/base/util.hh" +#include "accl/graph/base/util.hh" #include "params/BaseReduceEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -43,7 +43,6 @@ class BaseReduceEngine : public ClockedObject private: System* system; - bool handleIncomingWL(Addr addr, WorkListItem wl); protected: Addr currentWorkListAddress; @@ -60,6 +59,8 @@ class BaseReduceEngine : public ClockedObject ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } + + void handleIncomingWL(Addr addr, WorkListItem wl); }; } diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/util.hh index a4418a1cb8..1066d37d1c 100644 --- a/src/accl/graph/base/util.hh +++ b/src/accl/graph/base/util.hh @@ -26,6 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#ifndef __ACCL_GRAPH_BASE_UTIL_HH__ +#define __ACCL_GRAPH_BASE_UTIL_HH__ + #include "base/cprintf.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -75,3 +78,5 @@ PacketPtr getUpdatePacket(Addr addr, unsigned int size, uint8_t *data, RequestorID requestorId); } + +#endif // __ACCL_GRAPH_BASE_UTIL_HH__ diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 9036b4e401..129d9454c7 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -35,6 +35,6 @@ class PushEngine(BaseReadEngine): cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr() + base_edge_addr = Param.Addr("") mem_resp_queue_size = Param.Int(0, "") push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index ec9154b138..cab47fbe7b 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -35,6 +35,6 @@ class WLEngine(BaseReduceEngine): cxx_class = 'gem5::WLEngine' resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoaleseEngine(NULL, "") + coalesce_engine = Param.CoalesceEngine(NULL, "") update_queue_size = Param.Int(0, "") on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 22bc0d49a6..663559cc63 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -26,9 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/sega/coalesce_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" -#include "accl/sega/wl_engine.hh" +#include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" namespace gem5 @@ -40,12 +40,13 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), + nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), + nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} -CoalesceEngine::~CoalesceEngine() -{} +// CoalesceEngine::~CoalesceEngine() +// {} void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -86,8 +87,8 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries return false; } else { - if (cacheBlock[block_index].allocated) { - assert(MSHRMap[block_index].size() <= numTgtsPerMSHR) + if (cacheBlocks[block_index].allocated) { + assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { return false; } @@ -122,6 +123,10 @@ CoalesceEngine::recvReadAddr(Addr addr) return true; } } + } else { + assert(cacheBlocks[block_index].hasConflict); + MSHRMap[block_index].push_back(addr); + return true; } } } @@ -167,12 +172,12 @@ CoalesceEngine::processNextRespondEvent() bool CoalesceEngine::handleMemResp(PacketPtr pkt) { - if (pkt->isResp() && pkt->isWrite()) { + if (pkt->isResponse() && pkt->isWrite()) { return true; } Addr addr = pkt->getAddr(); - uint8_t data = pkt->getPtr(); + uint8_t* data = pkt->getPtr(); int block_index = addr % 256; assert((cacheBlocks[block_index].allocated) && // allocated cache block @@ -218,6 +223,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } + + return 
true; } void @@ -229,7 +236,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); - cacheBlocks[block_index].item[wl_offset] = wl; + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); // TODO: Make this more general and programmable. @@ -261,10 +268,10 @@ CoalesceEngine::processNextApplyAndCommitEvent() changedMask |= (1 << i); } uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); - std::memcpy(data[i * 16], wl_data, sizeof(WorkListItem)); + std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); } - if (changed) { + if (changedMask) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = getWritePacket( cacheBlocks[block_index].addr, 64, data, _requestorId); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f5fd85e4cf..6086a8855e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,8 +29,10 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#include "accl/base/base_read_engine.hh" -#include "accl/sega/push_engine.hh" +#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/util.hh" +#include "accl/graph/sega/push_engine.hh" +#include "params/CoalesceEngine.hh" namespace gem5 { @@ -85,7 +87,7 @@ class CoalesceEngine : public BaseReadEngine PARAMS(CoalesceEngine); CoalesceEngine(const CoalesceEngineParams ¶ms); - ~CoalesceEngine(); + // ~CoalesceEngine(); void recvFunctional(PacketPtr pkt); @@ -93,7 +95,7 @@ class CoalesceEngine : public BaseReadEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); -} +}; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c865451999..2a978cfcc5 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/push_engine.hh" +#include "debug/MPU.hh" + namespace gem5 { @@ -35,8 +37,8 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): BaseReadEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), - memRespQueueSize(params.mem_resp_queue_size), pushReqQueueSize(params.push_req_queue_size), + memRespQueueSize(params.mem_resp_queue_size), onTheFlyReadReqs(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextReadEvent([this] { processNextReadEvent(); }, name()), @@ -87,7 +89,7 @@ PushEngine::ReqPort::recvReqRetry() } bool -PushEngine::recvWLItem(WorkListItem wl); +PushEngine::recvWLItem(WorkListItem wl) { assert(pushReqQueue.size() <= pushReqQueueSize); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { @@ -133,14 +135,14 @@ PushEngine::processNextAddrGenEvent() }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + PacketPtr pkt = getReadPacket(addr_queue[index], 64, _requestorId); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; pendingReadReqs.push(pkt); } - pushReadReqs.pop(); + pushReqQueue.pop(); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); @@ -176,6 +178,7 @@ PushEngine::handleMemResp(PacketPtr pkt) if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); } + return true; } void @@ -199,7 +202,8 @@ PushEngine::processNextPushEvent() *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - requestorId); + _requestorId); + if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); DPRINTF(MPU, "%s: Reading %s, updating 
with %d\n" @@ -213,4 +217,14 @@ PushEngine::processNextPushEvent() } } +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; +} + } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index bf645eb119..e97a26c7bd 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/util.hh" #include "params/PushEngine.hh" namespace gem5 @@ -65,6 +66,10 @@ class PushEngine : public BaseReadEngine int pushReqQueueSize; std::queue pushReqQueue; + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map reqValueMap; + // TODO: Possibility of infinite queueing std::queue pendingReadReqs; @@ -72,6 +77,8 @@ class PushEngine : public BaseReadEngine int onTheFlyReadReqs; std::queue memRespQueue; + bool sendPushUpdate(PacketPtr pkt); + EventFunctionWrapper nextAddrGenEvent; void processNextAddrGenEvent(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f0c522ff6f..43ad112db3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -36,13 +36,13 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): BaseReduceEngine(params), respPort(name() + ".resp_port", this), blockedByCoalescer(false), - coaleseEngine(params.coalesce_engine), + coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()) { - coaleseEngine->registerWLEngine(this); + coalesceEngine->registerWLEngine(this); } Port& @@ -82,14 +82,14 @@ WLEngine::startup() uint8_t* data = workListToMemory(vertices[i]); PacketPtr pkt = getWritePacket(0 + i * 
sizeof(WorkListItem), 16, data, 0); - sendMemFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } for (int i = 0; i < 7; i++) { uint8_t* data = edgeToMemory(edges[i]); PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), 16, data, 0); - sendMemFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } uint8_t* first_update_data = new uint8_t [4]; @@ -97,9 +97,9 @@ WLEngine::startup() *tempPtr = 0; PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, requestorId); + 0, 4, first_update_data, _requestorId); - handleWLUpdate(first_update); + handleIncomingUpdate(first_update); } AddrRangeList @@ -135,13 +135,13 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - coaleseEngine->recvFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } AddrRangeList -WLEngine::getAddrRanges() +WLEngine::getAddrRanges() const { - return coaleseEngine->getAddrRanges(); + return coalesceEngine->getAddrRanges(); } void @@ -149,18 +149,18 @@ WLEngine::processNextReadEvent() { PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); - uint32_t update_value = update->getPtr(); + uint32_t* update_value = update->getPtr(); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { - onTheFlyUpdateMap[update_addr] = update_value + onTheFlyUpdateMap[update_addr] = *update_value; updateQueue.pop(); } } else { // TODO: Generalize this to reduce function rather than just min onTheFlyUpdateMap[update_addr] = - min(update_addr, onTheFlyUpdateMap[update_addr]); + std::min(*update_value, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); // TODO: Add a stat to count the number of coalescions } @@ -176,8 +176,9 @@ void WLEngine::processNextReduceEvent() { // TODO: Generalize this to reduce function rather than just min - currentWorkList.temp_prop = min(onTheFlyUpdateMap[currentWorkListAddress], - 
currentWorkList.temp_prop); + currentWorkList.temp_prop = std::min( + onTheFlyUpdateMap[currentWorkListAddress], + currentWorkList.temp_prop); // TODO: Add a delay here coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1846825951..3ce01dd69d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -63,7 +63,7 @@ class WLEngine : public BaseReduceEngine RespPort respPort; bool blockedByCoalescer; - CoalesceEngine* coaleseEngine; + CoalesceEngine* coalesceEngine; int updateQueueSize; std::queue updateQueue; @@ -84,7 +84,7 @@ class WLEngine : public BaseReduceEngine void processNextReduceEvent(); protected: - virtual void scheduleReduceEvent() = 0; + virtual void scheduleReduceEvent(); public: PARAMS(WLEngine); From df5706a46ff4b39293a26c4b3c06dc7aee1aa2d5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Mar 2022 00:34:29 -0700 Subject: [PATCH 072/247] Debugging after compilation. 
Loop writting to mem --- configs/accl/sega.py | 28 +++++--- src/accl/graph/base/base_reduce_engine.cc | 8 --- src/accl/graph/base/base_reduce_engine.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 83 +++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 5 +- src/accl/graph/sega/wl_engine.cc | 60 +++++++++++----- src/accl/graph/sega/wl_engine.hh | 6 +- 8 files changed, 126 insertions(+), 72 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 163ea169d9..f71b0e73e0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,15 +4,12 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.lock_dir = LockDirectory() - self.push_engine = PushEngine() - self.apply_engine = ApplyEngine(push_engine = self.push_engine, lock_dir = self.lock_dir) - self.wl_engine = WLEngine(apply_engine = self.apply_engine, lock_dir = self.lock_dir) + self.push_engine = PushEngine(base_edge_addr=0x100000, push_req_queue_size = 16) + self.coalesce_engine = CoalesceEngine(peer_push_engine=self.push_engine) + self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size = 16, on_the_fly_update_map_size=8) self.interconnect = SystemXBar() - - self.interconnect.cpu_side_ports = self.wl_engine.mem_port - self.interconnect.cpu_side_ports = self.apply_engine.mem_port + self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port def getRespPort(self): @@ -30,6 +27,16 @@ def getMemPort(self): def setMemPort(self, port): self.interconnect.mem_side_ports = port + def getVertexMemPort(self): + return self.coalesce_engine.mem_port + def setVertexMemPort(self, port): + self.coalesce_engine.mem_port = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + class SEGA(System): def __init__(self): super(SEGA, self).__init__() @@ 
-40,8 +47,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") - # self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) - + # self.mem_ctrl = MemCtrl() + # self.mem_ctrl.dram = DDR4_2400_8x8(range=AddrRange(start=0x000000, size="1MiB")) + # self.mem_ctrl.nvm = NVM_2400_1x64(range=AddrRange(start=0x100000, size="1MiB")) self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.port) @@ -50,6 +58,6 @@ def __init__(self): m5.instantiate() -exit_event = m5.simulate(1000000) +exit_event = m5.simulate() print("Simulation finished!") exit() diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index 82643ba3ff..38a8662ed0 100644 --- a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -40,12 +40,4 @@ BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): BaseReduceEngine::~BaseReduceEngine() {} -void -BaseReduceEngine::handleIncomingWL(Addr addr, WorkListItem wl) -{ - currentWorkListAddress = addr; - currentWorkList = wl; - scheduleReduceEvent(); -} - } diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index 7851eaf585..64d6e4c8c0 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -50,8 +50,6 @@ class BaseReduceEngine : public ClockedObject const RequestorID _requestorId; - virtual void scheduleReduceEvent() = 0; - public: PARAMS(BaseReduceEngine); @@ -60,7 +58,7 @@ class BaseReduceEngine : public ClockedObject RequestorID requestorId() { return _requestorId; } - void handleIncomingWL(Addr addr, WorkListItem wl); + virtual void handleIncomingWL(Addr addr, WorkListItem wl) = 0; }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 663559cc63..aa6bc99887 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,8 +45,16 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} -// CoalesceEngine::~CoalesceEngine() -// {} +void +CoalesceEngine::startup() +{ + for (int i = 0; i < 256; i++) { + cacheBlocks[i].takenMask = 0; + cacheBlocks[i].allocated = false; + cacheBlocks[i].valid = false; + cacheBlocks[i].hasConflict = false; + } +} void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -64,6 +72,8 @@ bool CoalesceEngine::recvReadAddr(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); + DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", + __func__, addr); Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; @@ -71,11 +81,13 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((cacheBlocks[block_index].addr == alligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" + , __func__, addr); addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -93,18 +105,26 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } // MSHR available but conflict + DPRINTF(MPU, "%s: Read request with addr: %lu missed with " + "conflict. 
Making a request for " + "alligned_addr: %lu.\n", + __func__, addr, alligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); return true; } else { // MSHR available and no conflict assert( - outstandingMemReqQueue.size() <= + outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - if (outstandingMemReqQueue.size() == + if (outstandingMemReqQueue.size() == outstandingMemReqQueueSize) { return false; } + DPRINTF(MPU, "%s: Read request with addr: " + "%lu missed with no conflict. " + "Making a request for alligned_addr: %lu.\n" + , __func__, addr, alligned_addr); cacheBlocks[block_index].addr = alligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; @@ -112,7 +132,7 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].hasConflict = false; MSHRMap[block_index].push_back(addr); - PacketPtr pkt = getReadPacket(alligned_addr, + PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); outstandingMemReqQueue.push(pkt); @@ -124,11 +144,15 @@ CoalesceEngine::recvReadAddr(Addr addr) } } } else { - assert(cacheBlocks[block_index].hasConflict); + if ((!cacheBlocks[block_index].hasConflict) && + ((addr < cacheBlocks[block_index].addr) || + (addr >= (cacheBlocks[block_index].addr + 64)))) { + cacheBlocks[block_index].hasConflict = true; + } MSHRMap[block_index].push_back(addr); return true; } - } + } } void @@ -143,7 +167,7 @@ CoalesceEngine::processNextMemReqEvent() if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); + schedule(nextMemReqEvent, nextCycle()); } } @@ -152,23 +176,19 @@ CoalesceEngine::processNextRespondEvent() { Addr addr_response = addrResponseQueue.front(); WorkListItem worklist_response = worklistResponseQueue.front(); - + peerWLEngine->handleIncomingWL(addr_response, worklist_response); addrResponseQueue.pop(); worklistResponseQueue.pop(); if ((!nextRespondEvent.scheduled()) && - 
(!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } -/* - void recvWLWrite(Addr addr, WorkListItem wl); -*/ - bool CoalesceEngine::handleMemResp(PacketPtr pkt) { @@ -183,11 +203,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR - cacheBlocks[block_index].valid = true; for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); } + cacheBlocks[block_index].valid = true; int bias = 0; std::vector servicedIndices; @@ -201,12 +221,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) worklistResponseQueue.push( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - servicedIndices.push_back(i); + servicedIndices.push_back(i); } } // TODO: We Can use taken instead of this for (int i = 0; i < servicedIndices.size(); i++) { - MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; } @@ -219,7 +239,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -233,12 +253,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; - - assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + DPRINTF(MPU, "%s: Recieved a WorkList write. 
addr: %lu, wl: %s.\n", + __func__, addr, wl.to_string()); + DPRINTF(MPU, "%s: alligned_addr: %lu, block_index: %d, wl_offset: %d, " + "takenMask: %u.\n", __func__, alligned_addr, + block_index, wl_offset, cacheBlocks[block_index].takenMask); + assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - + // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) if ((cacheBlocks[block_index].takenMask == 0)) { @@ -267,6 +291,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); } + DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. " + "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, + i, cacheBlocks[block_index].items[i].to_string()); uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); } @@ -275,7 +302,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = getWritePacket( cacheBlocks[block_index].addr, 64, data, _requestorId); - + if ((cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ Addr miss_addr = MSHRMap[block_index][0]; @@ -304,7 +331,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].hasConflict = true; evictQueue.pop(); } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { outstandingMemReqQueue.push(write_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { @@ -325,16 +352,16 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].hasConflict = false; evictQueue.pop(); } 
else { - DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , + DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , __func__); } } - + if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); + schedule(nextMemReqEvent, nextCycle()); } - + if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty())) { schedule(nextApplyAndCommitEvent, nextCycle()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6086a8855e..6dc7bc1001 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -56,7 +56,7 @@ class CoalesceEngine : public BaseReadEngine WLEngine* peerWLEngine; PushEngine* peerPushEngine; - + Block cacheBlocks[256]; int numMSHREntry; @@ -71,6 +71,8 @@ class CoalesceEngine : public BaseReadEngine std::queue evictQueue; + virtual void startup(); + EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 2a978cfcc5..06b5381641 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -95,6 +95,7 @@ PushEngine::recvWLItem(WorkListItem wl) if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { return false; } + pushReqQueue.push(wl); if ((!nextAddrGenEvent.scheduled()) && @@ -204,10 +205,10 @@ PushEngine::processNextPushEvent() sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, _requestorId); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); // TODO: Erase map entries here. 
} } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 43ad112db3..b7f59987cb 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -154,45 +154,70 @@ WLEngine::processNextReadEvent() if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { + DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " + "update_addr: %lu, update_value: %u.\n", + __func__, update_addr, *update_value); onTheFlyUpdateMap[update_addr] = *update_value; + DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); } } else { // TODO: Generalize this to reduce function rather than just min + DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." + "update_addr: %lu, update_value: %u, old_value: %u.\n", + __func__, update_addr, *update_value, + onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions } if ((!nextReadEvent.scheduled()) && - ((!updateQueue.empty()) || - (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize))) { + (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); } } void -WLEngine::processNextReduceEvent() +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { - // TODO: Generalize this to reduce function rather than just min - currentWorkList.temp_prop = std::min( - onTheFlyUpdateMap[currentWorkListAddress], - currentWorkList.temp_prop); - // TODO: Add a delay here - coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); - - onTheFlyUpdateMap.erase(currentWorkListAddress); - currentWorkListAddress = 
0; - currentWorkList = {0, 0, 0, 0}; + assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + addrWorkListMap[addr] = wl; + // TODO: Add checks to see if scheduling is necessary or correct. + if ((!nextReduceEvent.scheduled()) && (!addrWorkListMap.empty())) { + schedule(nextReduceEvent, nextCycle()); + } } void -WLEngine::scheduleReduceEvent() +WLEngine::processNextReduceEvent() { - // TODO: Add checks to see if scheduling is necessary or correct. - if (!nextReduceEvent.scheduled()) { - schedule(nextReduceEvent, nextCycle()); + + std::unordered_map::iterator it = + addrWorkListMap.begin(); + + std::vector servicedAddresses; + while (it != addrWorkListMap.end()) { + Addr addr = it->first; + WorkListItem wl = it->second; + uint32_t update_value = onTheFlyUpdateMap[addr]; + DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " + "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, + onTheFlyUpdateMap[addr]); + // TODO: Generalize this to reduce function rather than just min + wl.temp_prop = std::min(update_value, wl.temp_prop); + coalesceEngine->recvWLWrite(addr, wl); + servicedAddresses.push_back(addr); + it++; + } + + addrWorkListMap.clear(); + for (int i = 0; i < servicedAddresses.size(); i++) { + onTheFlyUpdateMap.erase(servicedAddresses[i]); } } @@ -206,6 +231,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push(pkt); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 3ce01dd69d..1ccb13d91e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -71,6 +71,7 @@ class WLEngine : public BaseReduceEngine int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; + std::unordered_map addrWorkListMap; virtual void startup(); void recvFunctional(PacketPtr pkt); @@ -83,9 +84,6 @@ 
class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); - protected: - virtual void scheduleReduceEvent(); - public: PARAMS(WLEngine); @@ -95,6 +93,8 @@ class WLEngine : public BaseReduceEngine PortID idx=InvalidPortID) override; bool handleIncomingUpdate(PacketPtr pkt); + + virtual void handleIncomingWL(Addr addr, WorkListItem wl); }; } From ca2f0692bf3cf8fcd4b4459e1b352c6d795b95b0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Mar 2022 00:51:48 -0700 Subject: [PATCH 073/247] Correctness tested with small graph. --- src/accl/graph/sega/coalesce_engine.cc | 23 ++++++++++++++++++++--- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index aa6bc99887..62062116c2 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -265,8 +265,19 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) // TODO: Make this more general and programmable. 
// && (cacheBlocks[block_index].hasConflict) + bool found = false; if ((cacheBlocks[block_index].takenMask == 0)) { - evictQueue.push(block_index); + for (auto index : evictQueue) { + if (block_index == index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + } + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } if ((!nextApplyAndCommitEvent.scheduled()) && @@ -329,7 +340,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - evictQueue.pop(); + evictQueue.pop_front(); + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } else if ((!cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { outstandingMemReqQueue.push(write_pkt); @@ -350,11 +363,15 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - evictQueue.pop(); + evictQueue.pop_front(); + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } else { DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , __func__); } + } else { + evictQueue.pop_front(); } if ((!nextMemReqEvent.scheduled()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6dc7bc1001..3290f646f4 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -69,7 +69,7 @@ class CoalesceEngine : public BaseReadEngine std::queue addrResponseQueue; std::queue worklistResponseQueue; - std::queue evictQueue; + std::deque evictQueue; virtual void startup(); From 358c8e6e9e0a59f7a5a3d6f780e47b559d3e524e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 23 Mar 2022 09:53:26 -0700 Subject: [PATCH 074/247] Added performance statistics. 
--- src/accl/graph/sega/coalesce_engine.cc | 32 +++++++++++++++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 18 +++++++++++++++ src/accl/graph/sega/wl_engine.cc | 22 +++++++++++++++++- src/accl/graph/sega/wl_engine.hh | 15 ++++++++++++ 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 62062116c2..d58a36188e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -42,7 +42,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) + nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), + stats(*this) {} void @@ -86,6 +87,8 @@ CoalesceEngine::recvReadAddr(Addr addr) addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.readHits++; + stats.numVertexReads++; if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { @@ -138,6 +141,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { + stats.numVertexBlockReads++; schedule(nextMemReqEvent, nextCycle()); } return true; @@ -221,6 +225,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) worklistResponseQueue.push( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.numVertexReads++; servicedIndices.push_back(i); } } @@ -262,6 +267,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 
<< wl_offset); + stats.numVertexWrites++; // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) @@ -376,6 +382,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { + stats.numVertexBlockWrites++; schedule(nextMemReqEvent, nextCycle()); } @@ -385,4 +392,27 @@ CoalesceEngine::processNextApplyAndCommitEvent() } } +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + + ADD_STAT(numVertexBlockReads, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numVertexBlockWrites, statistics::units::Count::get(), + "Number of memory blocks writes for vertecies"), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; +} + } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3290f646f4..d45fffa3aa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -32,6 +32,7 @@ #include "accl/graph/base/base_read_engine.hh" #include "accl/graph/base/util.hh" #include "accl/graph/sega/push_engine.hh" +#include "base/statistics.hh" #include "params/CoalesceEngine.hh" namespace gem5 @@ -82,6 +83,23 @@ class CoalesceEngine : public BaseReadEngine EventFunctionWrapper nextApplyAndCommitEvent; void processNextApplyAndCommitEvent(); + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + void regStats() override; + + CoalesceEngine &coalesce; + + statistics::Scalar numVertexBlockReads; + 
statistics::Scalar numVertexBlockWrites; + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + }; + + CoalesceStats stats; + protected: virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b7f59987cb..517d10ef67 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -40,7 +40,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), - nextReduceEvent([this]{ processNextReduceEvent(); }, name()) + nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + stats(*this) { coalesceEngine->registerWLEngine(this); } @@ -171,6 +172,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); + stats.onTheFlyCoalesce++; updateQueue.pop(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions @@ -209,6 +211,7 @@ WLEngine::processNextReduceEvent() "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, onTheFlyUpdateMap[addr]); // TODO: Generalize this to reduce function rather than just min + stats.numReduce++; wl.temp_prop = std::min(update_value, wl.temp_prop); coalesceEngine->recvWLWrite(addr, wl); servicedAddresses.push_back(addr); @@ -239,4 +242,21 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return true; } +WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) + : statistics::Group(&_wl), + wl(_wl), + + ADD_STAT(numReduce, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(onTheFlyCoalesce, statistics::units::Count::get(), + "Number of memory blocks read for vertecies") +{ +} + +void +WLEngine::WorkListStats::regStats() +{ + using 
namespace statistics; +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1ccb13d91e..891916e7af 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "base/statistics.hh" #include "params/WLEngine.hh" namespace gem5 @@ -84,6 +85,20 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + struct WorkListStats : public statistics::Group + { + WorkListStats(WLEngine &worklist); + + void regStats() override; + + WLEngine &wl; + + statistics::Scalar numReduce; + statistics::Scalar onTheFlyCoalesce; + }; + + WorkListStats stats; + public: PARAMS(WLEngine); From c6ae6a6c93f0527d83044d4b207a9507a779a1b3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 14:10:40 -0700 Subject: [PATCH 075/247] Updating definitions for structs and removing unnecessary funcs. 
--- configs/accl/sega.py | 50 +++++--- src/accl/graph/base/base_read_engine.cc | 15 +++ src/accl/graph/base/base_read_engine.hh | 4 +- src/accl/graph/base/base_reduce_engine.hh | 2 - src/accl/graph/base/util.cc | 145 ---------------------- src/accl/graph/base/util.hh | 54 ++++---- src/accl/graph/sega/coalesce_engine.cc | 98 ++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 12 +- src/accl/graph/sega/push_engine.cc | 42 ++++++- src/accl/graph/sega/push_engine.hh | 4 + src/accl/graph/sega/wl_engine.cc | 59 ++------- src/accl/graph/sega/wl_engine.hh | 1 - 12 files changed, 201 insertions(+), 285 deletions(-) delete mode 100644 src/accl/graph/base/util.cc diff --git a/configs/accl/sega.py b/configs/accl/sega.py index f71b0e73e0..8ea247106e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,9 +4,13 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x100000, push_req_queue_size = 16) - self.coalesce_engine = CoalesceEngine(peer_push_engine=self.push_engine) - self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size = 16, on_the_fly_update_map_size=8) + self.push_engine = PushEngine(base_edge_addr=0x100000, + push_req_queue_size = 16) + self.coalesce_engine = CoalesceEngine( + peer_push_engine=self.push_engine) + self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, + update_queue_size = 16, + on_the_fly_update_map_size=8) self.interconnect = SystemXBar() self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port @@ -27,31 +31,41 @@ def getMemPort(self): def setMemPort(self, port): self.interconnect.mem_side_ports = port - def getVertexMemPort(self): - return self.coalesce_engine.mem_port - def setVertexMemPort(self, port): - self.coalesce_engine.mem_port = port +class MPUMemory(SubSystem): + def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): + super(MPUMemory, self).__init__() + self.vertex_mem_ctrl = SimpleMemory( + 
range=vertex_range, bandwidth="25GB/s", + latency="30ns", image_file=vertex_binary) + self.edge_mem_ctrl = SimpleMemory( + range=edge_range, bandwidth="25GB/s", + latency="30ns", image_file=edge_binary) + self.interconnect = SystemXBar() + + self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port + self.interconnect.mem_side_ports = self.edge_mem_ctrl.port - def getEdgeMemPort(self): - return self.push_engine.mem_port - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port + def getPort(self): + return self.interconnect.cpu_side_ports + def setPort(self, port): + self.interconnect.cpu_side_ports = port class SEGA(System): def __init__(self): super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() self.mpu = MPU() - self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") - # self.mem_ctrl = MemCtrl() - # self.mem_ctrl.dram = DDR4_2400_8x8(range=AddrRange(start=0x000000, size="1MiB")) - # self.mem_ctrl.nvm = NVM_2400_1x64(range=AddrRange(start=0x100000, size="1MiB")) + self.mem_ctrl = MPUMemory( + vertex_range=AddrRange(start=0x000000, size="2GiB"), + vertex_binary="live-journal/graph_binaries/vertices", + edge_range=AddrRange(start=0x80000000, size="2GiB"), + edge_binary="live-journal/graph_binaries/edgelist_0") + self.mpu.setReqPort(self.mpu.getRespPort()) - self.mpu.setMemPort(self.mem_ctrl.port) + self.mpu.setMemPort(self.mem_ctrl.getPort()) system = SEGA() root = Root(full_system = False, system = system) diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 894831429b..a32237db35 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -83,4 +83,19 @@ BaseReadEngine::MemPort::recvReqRetry() } } +PacketPtr +BaseReadEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = 
std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + } diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 956c50e47d..591b51aeb7 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -78,12 +78,14 @@ class BaseReadEngine : public ClockedObject virtual bool handleMemResp(PacketPtr pkt) = 0; + PacketPtr createReadPacket(Addr addr, unsigned int size); + public: PARAMS(BaseReadEngine); BaseReadEngine(const BaseReadEngineParams ¶ms); ~BaseReadEngine(); - + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index 64d6e4c8c0..f2245f571f 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -45,8 +45,6 @@ class BaseReduceEngine : public ClockedObject protected: - Addr currentWorkListAddress; - WorkListItem currentWorkList; const RequestorID _requestorId; diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc deleted file mode 100644 index 4172607ed0..0000000000 --- a/src/accl/graph/base/util.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/util.hh" - -namespace gem5 -{ - -WorkListItem -memoryToWorkList(uint8_t* data){ - WorkListItem wl; - - uint32_t temp_prop = *((uint32_t*) data); - uint32_t prop = *((uint32_t*) (data + 4)); - uint32_t degree = *((uint32_t*) (data + 8)); - uint32_t addr = *((uint32_t*) (data + 12)); - - wl = {temp_prop, prop, degree, addr}; - return wl; -} - -uint8_t* -workListToMemory(WorkListItem wl){ - int data_size = sizeof(WorkListItem) / sizeof(uint8_t); - uint8_t* data = new uint8_t [data_size]; - - uint32_t* tempPtr = (uint32_t*) data; - *tempPtr = wl.temp_prop; - - uint32_t* propPtr = (uint32_t*) (data + 4); - *propPtr = wl.prop; - - uint32_t* degreePtr = (uint32_t*) (data + 8); - *degreePtr = wl.degree; - - uint32_t* edgePtr = (uint32_t*) (data + 12); - *edgePtr = wl.edgeIndex; - - return data; -} - -// Edge: (weight: 64 bits, neighbor: 64 bits) -Edge -memoryToEdge(uint8_t *data) -{ - uint64_t weight = *((uint64_t*) data); - Addr neighbor = *((Addr*) (data + 8)); // data + 8 because weight: 8 bytes - Edge e = {weight, neighbor}; - return e; -} - -// Edge: (weight: 64 bits, neighbor: 64 bits) -uint8_t* -edgeToMemory(Edge e) -{ - int data_size = (int) ((sizeof(Edge)) / (sizeof(uint8_t))); - - uint8_t* data = new uint8_t [data_size]; - - uint64_t* weightPtr = (uint64_t*) data; - *weightPtr = e.weight; - - Addr* neighborPtr = (Addr*) (data + 8); // data + 8 because weight: 8 bytes - *neighborPtr = e.neighbor; - - return data; -} - -PacketPtr -getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) -{ - RequestPtr req = std::make_shared(addr, size, 0, requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; -} - -PacketPtr -getWritePacket(Addr addr, unsigned int size, - uint8_t* data, RequestorID requestorId) -{ - RequestPtr req = 
std::make_shared(addr, size, 0, - requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -PacketPtr -getUpdatePacket(Addr addr, unsigned int size, - uint8_t *data, RequestorID requestorId) -{ - RequestPtr req = std::make_shared(addr, size, 0, - requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -} diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/util.hh index 1066d37d1c..b51a9f0781 100644 --- a/src/accl/graph/base/util.hh +++ b/src/accl/graph/base/util.hh @@ -30,52 +30,56 @@ #define __ACCL_GRAPH_BASE_UTIL_HH__ #include "base/cprintf.hh" -#include "base/types.hh" -#include "mem/packet.hh" -#include "mem/request.hh" namespace gem5 { -struct WorkListItem +struct __attribute__ ((packed)) WorkListItem { - uint32_t temp_prop; - uint32_t prop; - uint32_t degree; - uint32_t edgeIndex; + uint32_t tempProp : 32; + uint32_t prop : 32; + uint32_t degree : 32; + uint32_t edgeIndex : 32; std::string to_string() { return csprintf( "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", - temp_prop, prop, degree, edgeIndex); + tempProp, prop, degree, edgeIndex); } + WorkListItem(): + tempProp(0), + prop(0), + degree(0), + edgeIndex(0) + {} + + WorkListItem(uint32_t temp_prop, uint32_t prop, + uint32_t degree, uint32_t edge_index): + tempProp(temp_prop), + prop(prop), + degree(degree), + edgeIndex(edge_index) + {} + }; -struct Edge +struct __attribute__ ((packed)) Edge { - uint64_t weight; - Addr neighbor; + uint16_t weight : 16; + uint64_t neighbor : 48; std::string to_string() { return csprintf("Edge{weight: %lu, neighbor: 
%lu}", weight, neighbor); } -}; -WorkListItem memoryToWorkList(uint8_t* data); -uint8_t* workListToMemory(WorkListItem wl); - -Edge memoryToEdge(uint8_t* data); -uint8_t* edgeToMemory(Edge e); - -PacketPtr getReadPacket(Addr addr, unsigned int size, - RequestorID requestorId); -PacketPtr getWritePacket(Addr addr, unsigned int size, - uint8_t* data, RequestorID requestorId); -PacketPtr getUpdatePacket(Addr addr, unsigned int size, - uint8_t *data, RequestorID requestorId); + Edge(uint16_t weight, uint64_t neighbor): + weight(weight), + neighbor(neighbor) + {} +}; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d58a36188e..67874cb9b9 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -75,29 +75,33 @@ CoalesceEngine::recvReadAddr(Addr addr) assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); - Addr alligned_addr = (addr / 64) * 64; - int block_index = alligned_addr % 256; - int wl_offset = (addr - alligned_addr) / 16; + Addr aligned_addr = (addr / 64) * 64; + int block_index = aligned_addr % 256; + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - if ((cacheBlocks[block_index].addr == alligned_addr) && + if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" , __func__, addr); + // TODO: Make addrQueue and wlQueue into one std::pair addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.readHits++; stats.numVertexReads++; - if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + + assert(!worklistResponseQueue.empty() && !addrResponseQueue.empty()); + if 
(!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } return true; } else { // miss if (MSHRMap.find(block_index) == MSHRMap.end()) { + assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries return false; @@ -110,12 +114,14 @@ CoalesceEngine::recvReadAddr(Addr addr) // MSHR available but conflict DPRINTF(MPU, "%s: Read request with addr: %lu missed with " "conflict. Making a request for " - "alligned_addr: %lu.\n", - __func__, addr, alligned_addr); + "aligned_addr: %lu.\n", + __func__, addr, aligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); return true; } else { + // TODO: Set valid to false every deallocation and + // assert valid == false here. // MSHR available and no conflict assert( outstandingMemReqQueue.size() <= @@ -126,31 +132,34 @@ CoalesceEngine::recvReadAddr(Addr addr) } DPRINTF(MPU, "%s: Read request with addr: " "%lu missed with no conflict. " - "Making a request for alligned_addr: %lu.\n" - , __func__, addr, alligned_addr); - cacheBlocks[block_index].addr = alligned_addr; + "Making a request for aligned_addr: %lu.\n" + , __func__, addr, aligned_addr); + cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; MSHRMap[block_index].push_back(addr); - PacketPtr pkt = getReadPacket(alligned_addr, - 64, _requestorId); + // TODO: Parameterize 64 to memory atom size + PacketPtr pkt = createReadPacket(aligned_addr, 64); outstandingMemReqQueue.push(pkt); - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - stats.numVertexBlockReads++; + stats.numVertexBlockReads++; + + assert(!outstandingMemReqQueue.empty()); + if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); } return true; } } } else { + if (MSHRMap[block_index].size() == 
numTgtsPerMSHR) { + return false; + } if ((!cacheBlocks[block_index].hasConflict) && - ((addr < cacheBlocks[block_index].addr) || - (addr >= (cacheBlocks[block_index].addr + 64)))) { + (aligned_addr != cacheBlocks[block_index].addr)) { cacheBlocks[block_index].hasConflict = true; } MSHRMap[block_index].push_back(addr); @@ -196,20 +205,24 @@ CoalesceEngine::processNextRespondEvent() bool CoalesceEngine::handleMemResp(PacketPtr pkt) { - if (pkt->isResponse() && pkt->isWrite()) { + assert(pkt->isResponse()); + if (pkt->isWrite()) { return true; } Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - int block_index = addr % 256; + int block_index = addr % 256; // TODO: After parameterizing the cache size + // this 256 number should change to the cache + // size parameter. assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR for (int i = 0; i < 4; i++) { - cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); + cacheBlocks[block_index].items[i] = *((WorkListItem*) ( + data + (i * sizeof(WorkListItem)))); } cacheBlocks[block_index].valid = true; @@ -252,16 +265,32 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } +PacketPtr +CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - Addr alligned_addr = (addr / 64) * 64; - int block_index = alligned_addr % 256; - int wl_offset = (addr - alligned_addr) / 16; + Addr aligned_addr = (addr / 64) * 64; + int block_index = aligned_addr % 256; + int 
wl_offset = (addr - aligned_addr) / 16; DPRINTF(MPU, "%s: Recieved a WorkList write. addr: %lu, wl: %s.\n", __func__, addr, wl.to_string()); - DPRINTF(MPU, "%s: alligned_addr: %lu, block_index: %d, wl_offset: %d, " - "takenMask: %u.\n", __func__, alligned_addr, + DPRINTF(MPU, "%s: aligned_addr: %lu, block_index: %d, wl_offset: %d, " + "takenMask: %u.\n", __func__, aligned_addr, block_index, wl_offset, cacheBlocks[block_index].takenMask); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); @@ -298,35 +327,36 @@ CoalesceEngine::processNextApplyAndCommitEvent() { int block_index = evictQueue.front(); uint8_t changedMask = 0; + // TODO: parameterize 64 to memory atom size uint8_t data[64]; for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].temp_prop); + cacheBlocks[block_index].items[i].tempProp); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); } DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. 
" "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, i, cacheBlocks[block_index].items[i].to_string()); - uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); - std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); + uint8_t* wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); + std::memcpy(data + (i * sizeof(WorkListItem)), + wl_data, sizeof(WorkListItem)); } if (changedMask) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - PacketPtr write_pkt = getWritePacket( - cacheBlocks[block_index].addr, 64, data, _requestorId); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, 64, data); if ((cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ Addr miss_addr = MSHRMap[block_index][0]; // TODO: Make sure this trick works; Addr alligned_miss_addr = (miss_addr / 64) * 64; - PacketPtr read_pkt = getReadPacket( - alligned_miss_addr, 64, _requestorId); + PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); outstandingMemReqQueue.push(write_pkt); outstandingMemReqQueue.push(read_pkt); // TODO: This should be improved diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index d45fffa3aa..4bb21676d4 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -35,6 +35,8 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" +// TODO: Add parameters for size, memory atom size, type size, +// length of items in the blocks. 
namespace gem5 { @@ -53,6 +55,13 @@ class CoalesceEngine : public BaseReadEngine bool hasConflict; // TODO: This might be useful in the future // Tick lastWLWriteTick; + Block(): + addr(0), + takenMask(0), + allocated(false), + valid(false), + hasConflict(false) + {} }; WLEngine* peerWLEngine; @@ -74,6 +83,8 @@ class CoalesceEngine : public BaseReadEngine virtual void startup(); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); @@ -107,7 +118,6 @@ class CoalesceEngine : public BaseReadEngine PARAMS(CoalesceEngine); CoalesceEngine(const CoalesceEngineParams ¶ms); - // ~CoalesceEngine(); void recvFunctional(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 06b5381641..d09da113ee 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -57,6 +57,19 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::startup() +{ + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + + sendPushUpdate(first_update); +} + + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -136,7 +149,7 @@ PushEngine::processNextAddrGenEvent() }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, _requestorId); + PacketPtr pkt = createReadPacket(addr_queue[index], 64); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; @@ -182,6 +195,7 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } +// FIXME: FIX THIS FUNCTION FOR TIMING AND FUNCTIONAL ACCURACY. 
void PushEngine::processNextPushEvent() { @@ -196,17 +210,16 @@ PushEngine::processNextPushEvent() int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); - Edge e = memoryToEdge(curr_edge_data); + Edge* e = (Edge*) (curr_edge_data); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; - PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - _requestorId); + PacketPtr update = createUpdatePacket(e->neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); + , __func__, e->to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); // TODO: Erase map entries here. 
@@ -218,6 +231,23 @@ PushEngine::processNextPushEvent() } } +PacketPtr +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + bool PushEngine::sendPushUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e97a26c7bd..81acc9862b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -77,6 +77,10 @@ class PushEngine : public BaseReadEngine int onTheFlyReadReqs; std::queue memRespQueue; + virtual void startup(); + + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + bool sendPushUpdate(PacketPtr pkt); EventFunctionWrapper nextAddrGenEvent; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 517d10ef67..b874ec65ec 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -56,53 +56,6 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } } -void -WLEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - //FIXME: The WLEngine no longer has a MemPort. Update this to - // work with the CoalesceEngine instead. 
- WorkListItem vertices [5] = { - {10000, 10000, 3, 0}, // Addr: 0 - {10000, 10000, 1, 3}, // Addr: 16 - {10000, 10000, 1, 4}, // Addr: 32 - {10000, 10000, 1, 5}, // Addr: 48 - {10000, 10000, 0, 6} // Addr: 64 - }; - Edge edges [7] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64}, // Addr: 1048640 - {0, 32} - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - coalesceEngine->recvFunctional(pkt); - } - - for (int i = 0; i < 7; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - coalesceEngine->recvFunctional(pkt); - } - - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, _requestorId); - - handleIncomingUpdate(first_update); -} - AddrRangeList WLEngine::RespPort::getAddrRanges() const { @@ -152,6 +105,7 @@ WLEngine::processNextReadEvent() Addr update_addr = update->getAddr(); uint32_t* update_value = update->getPtr(); + // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { @@ -178,6 +132,7 @@ WLEngine::processNextReadEvent() // TODO: Add a stat to count the number of coalescions } + // TODO: Only schedule nextReadEvent only when it has to be scheduled if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); @@ -208,11 +163,12 @@ WLEngine::processNextReduceEvent() WorkListItem wl = it->second; uint32_t update_value = onTheFlyUpdateMap[addr]; DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " - "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, + 
"%d, with new update: %d.\n", __func__, addr, wl.tempProp, onTheFlyUpdateMap[addr]); // TODO: Generalize this to reduce function rather than just min + wl.tempProp = std::min(update_value, wl.tempProp); stats.numReduce++; - wl.temp_prop = std::min(update_value, wl.temp_prop); + coalesceEngine->recvWLWrite(addr, wl); servicedAddresses.push_back(addr); it++; @@ -227,16 +183,15 @@ WLEngine::processNextReduceEvent() bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - // TODO: Coalesce updates here too assert(updateQueue.size() <= updateQueueSize); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } updateQueue.push(pkt); + assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); - if ((!nextReadEvent.scheduled()) && - (!updateQueue.empty())) { + if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } return true; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 891916e7af..ef18956ec1 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -73,7 +73,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map onTheFlyUpdateMap; std::unordered_map addrWorkListMap; - virtual void startup(); void recvFunctional(PacketPtr pkt); From aa5a5e06804582845ae1c33732d759a1d51a3ece Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 18:03:35 -0700 Subject: [PATCH 076/247] Fixing base_edge_addr in config and debugs. 
--- configs/accl/sega.py | 6 +++--- src/accl/graph/base/SConscript | 1 - src/accl/graph/sega/push_engine.cc | 11 +++++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8ea247106e..680157ba7e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,7 +4,7 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x100000, + self.push_engine = PushEngine(base_edge_addr=0x80000000, push_req_queue_size = 16) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) @@ -60,9 +60,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="live-journal/graph_binaries/vertices", + vertex_binary="epinions/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="live-journal/graph_binaries/edgelist_0") + edge_binary="epinions/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.getPort()) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 8aefca2185..ea96f4323b 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -32,4 +32,3 @@ SimObject('BaseReduceEngine.py') Source('base_read_engine.cc') Source('base_reduce_engine.cc') -Source('util.cc') diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d09da113ee..c305a4bbb9 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -203,23 +203,26 @@ PushEngine::processNextPushEvent() RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); + DPRINTF(MPU, "%s: Looking at the front of the queue. 
pkt->Addr: %lu.\n", + __func__, pkt->getAddr()); + Addr offset = reqOffsetMap[req]; int num_edges = reqNumEdgeMap[req]; uint32_t value = reqValueMap[req]; - int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); + uint8_t *curr_edge_data = data + offset + (i * sizeof(Edge)); Edge* e = (Edge*) (curr_edge_data); + DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; + DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", + __func__, e->neighbor, *update_data); PacketPtr update = createUpdatePacket(e->neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e->to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); // TODO: Erase map entries here. 
From b8df760f0512d590c32826349e408cebe0e075bb Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 31 Mar 2022 19:00:29 -0700 Subject: [PATCH 077/247] Changing queue to deque --- src/accl/graph/base/base_read_engine.hh | 1 - src/accl/graph/sega/coalesce_engine.cc | 22 +++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 6 +++--- src/accl/graph/sega/push_engine.cc | 12 ++++++------ src/accl/graph/sega/push_engine.hh | 6 +++--- src/accl/graph/sega/wl_engine.cc | 6 +++--- src/accl/graph/sega/wl_engine.hh | 2 +- 7 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 591b51aeb7..e21aaa01d2 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -29,7 +29,6 @@ #ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ #define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ -#include #include #include "base/addr_range.hh" diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 67874cb9b9..9fed1e8230 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -85,8 +85,8 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" , __func__, addr); // TODO: Make addrQueue and wlQueue into one std::pair - addrResponseQueue.push(addr); - worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + addrResponseQueue.push_back(addr); + worklistResponseQueue.push_back(cacheBlocks[block_index].items[wl_offset]); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -143,7 +143,7 @@ CoalesceEngine::recvReadAddr(Addr addr) MSHRMap[block_index].push_back(addr); // TODO: Parameterize 64 to memory atom size PacketPtr pkt = createReadPacket(aligned_addr, 64); - outstandingMemReqQueue.push(pkt); + outstandingMemReqQueue.push_back(pkt); 
stats.numVertexBlockReads++; @@ -175,7 +175,7 @@ CoalesceEngine::processNextMemReqEvent() if (!memPortBlocked()) { sendMemReq(pkt); - outstandingMemReqQueue.pop(); + outstandingMemReqQueue.pop_front(); } if ((!nextMemReqEvent.scheduled()) && @@ -192,8 +192,8 @@ CoalesceEngine::processNextRespondEvent() peerWLEngine->handleIncomingWL(addr_response, worklist_response); - addrResponseQueue.pop(); - worklistResponseQueue.pop(); + addrResponseQueue.pop_front(); + worklistResponseQueue.pop_front(); if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && @@ -234,8 +234,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (alligned_miss_addr == addr) { int wl_offset = (miss_addr - alligned_miss_addr) / 16; - addrResponseQueue.push(miss_addr); - worklistResponseQueue.push( + addrResponseQueue.push_back(miss_addr); + worklistResponseQueue.push_back( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; @@ -357,8 +357,8 @@ CoalesceEngine::processNextApplyAndCommitEvent() // TODO: Make sure this trick works; Addr alligned_miss_addr = (miss_addr / 64) * 64; PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); - outstandingMemReqQueue.push(write_pkt); - outstandingMemReqQueue.push(read_pkt); + outstandingMemReqQueue.push_back(write_pkt); + outstandingMemReqQueue.push_back(read_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -381,7 +381,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, evictQueue.size()); } else if ((!cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { - outstandingMemReqQueue.push(write_pkt); + outstandingMemReqQueue.push_back(write_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); diff --git 
a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4bb21676d4..2cb9856f76 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -74,10 +74,10 @@ class CoalesceEngine : public BaseReadEngine std::unordered_map> MSHRMap; int outstandingMemReqQueueSize; - std::queue outstandingMemReqQueue; + std::deque outstandingMemReqQueue; - std::queue addrResponseQueue; - std::queue worklistResponseQueue; + std::deque addrResponseQueue; + std::deque worklistResponseQueue; std::deque evictQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c305a4bbb9..450ba9ddc4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -109,7 +109,7 @@ PushEngine::recvWLItem(WorkListItem wl) return false; } - pushReqQueue.push(wl); + pushReqQueue.push_back(wl); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { @@ -153,10 +153,10 @@ PushEngine::processNextAddrGenEvent() reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; - pendingReadReqs.push(pkt); + pendingReadReqs.push_back(pkt); } - pushReqQueue.pop(); + pushReqQueue.pop_front(); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); @@ -175,7 +175,7 @@ PushEngine::processNextReadEvent() PacketPtr pkt = pendingReadReqs.front(); sendMemReq(pkt); onTheFlyReadReqs++; - pendingReadReqs.pop(); + pendingReadReqs.pop_front(); } if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { @@ -187,7 +187,7 @@ bool PushEngine::handleMemResp(PacketPtr pkt) { onTheFlyReadReqs--; - memRespQueue.push(pkt); + memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); @@ -224,7 +224,7 @@ PushEngine::processNextPushEvent() sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); if 
(sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop(); + memRespQueue.pop_front(); // TODO: Erase map entries here. } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 81acc9862b..1b1a812d16 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -64,18 +64,18 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::queue pushReqQueue; + std::deque pushReqQueue; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; // TODO: Possibility of infinite queueing - std::queue pendingReadReqs; + std::deque pendingReadReqs; int memRespQueueSize; int onTheFlyReadReqs; - std::queue memRespQueue; + std::deque memRespQueue; virtual void startup(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b874ec65ec..73eacf945f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -115,7 +115,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr] = *update_value; DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); - updateQueue.pop(); + updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); } } else { @@ -127,7 +127,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; - updateQueue.pop(); + updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions } @@ -188,7 +188,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.push(pkt); + updateQueue.push_back(pkt); assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { diff --git 
a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index ef18956ec1..c1ef028f77 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -67,7 +67,7 @@ class WLEngine : public BaseReduceEngine CoalesceEngine* coalesceEngine; int updateQueueSize; - std::queue updateQueue; + std::deque updateQueue; int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; From 2bfc6c7d5f6c2cb4911a2b72a228be95312a8dad Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 20:25:31 -0700 Subject: [PATCH 078/247] Removing old files and renaming utils to data_structs. --- src/accl/graph/base/base_reduce_engine.hh | 5 - .../graph/base/{util.hh => data_structs.hh} | 0 src/accl/graph/base/old/BaseApplyEngine.py | 36 ---- src/accl/graph/base/old/BaseEngine.py | 39 ----- src/accl/graph/base/old/BasePushEngine.py | 36 ---- src/accl/graph/base/old/BaseWLEngine.py | 36 ---- src/accl/graph/base/old/base_apply_engine.cc | 137 --------------- src/accl/graph/base/old/base_apply_engine.hh | 72 -------- src/accl/graph/base/old/base_engine.cc | 100 ----------- src/accl/graph/base/old/base_engine.hh | 98 ----------- src/accl/graph/base/old/base_push_engine.cc | 145 ---------------- src/accl/graph/base/old/base_push_engine.hh | 82 --------- src/accl/graph/base/old/base_wl_engine.cc | 134 --------------- src/accl/graph/base/old/base_wl_engine.hh | 83 ---------- src/accl/graph/sega/coalesce_engine.hh | 2 +- src/accl/graph/sega/old/ApplyEngine.py | 38 ----- src/accl/graph/sega/old/LockDir.py | 46 ------ src/accl/graph/sega/old/PushEngine.py | 37 ----- src/accl/graph/sega/old/WLEngine.py | 40 ----- src/accl/graph/sega/old/apply_engine.cc | 58 ------- src/accl/graph/sega/old/apply_engine.hh | 67 -------- src/accl/graph/sega/old/lock_dir.cc | 63 ------- src/accl/graph/sega/old/lock_dir.hh | 57 ------- src/accl/graph/sega/old/push_engine.cc | 90 ---------- src/accl/graph/sega/old/push_engine.hh | 77 --------- 
src/accl/graph/sega/old/wl_engine.cc | 156 ------------------ src/accl/graph/sega/old/wl_engine.hh | 86 ---------- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.hh | 3 +- 29 files changed, 4 insertions(+), 1821 deletions(-) rename src/accl/graph/base/{util.hh => data_structs.hh} (100%) delete mode 100644 src/accl/graph/base/old/BaseApplyEngine.py delete mode 100644 src/accl/graph/base/old/BaseEngine.py delete mode 100644 src/accl/graph/base/old/BasePushEngine.py delete mode 100644 src/accl/graph/base/old/BaseWLEngine.py delete mode 100644 src/accl/graph/base/old/base_apply_engine.cc delete mode 100644 src/accl/graph/base/old/base_apply_engine.hh delete mode 100644 src/accl/graph/base/old/base_engine.cc delete mode 100644 src/accl/graph/base/old/base_engine.hh delete mode 100644 src/accl/graph/base/old/base_push_engine.cc delete mode 100644 src/accl/graph/base/old/base_push_engine.hh delete mode 100644 src/accl/graph/base/old/base_wl_engine.cc delete mode 100644 src/accl/graph/base/old/base_wl_engine.hh delete mode 100644 src/accl/graph/sega/old/ApplyEngine.py delete mode 100644 src/accl/graph/sega/old/LockDir.py delete mode 100644 src/accl/graph/sega/old/PushEngine.py delete mode 100644 src/accl/graph/sega/old/WLEngine.py delete mode 100644 src/accl/graph/sega/old/apply_engine.cc delete mode 100644 src/accl/graph/sega/old/apply_engine.hh delete mode 100644 src/accl/graph/sega/old/lock_dir.cc delete mode 100644 src/accl/graph/sega/old/lock_dir.hh delete mode 100644 src/accl/graph/sega/old/push_engine.cc delete mode 100644 src/accl/graph/sega/old/push_engine.hh delete mode 100644 src/accl/graph/sega/old/wl_engine.cc delete mode 100644 src/accl/graph/sega/old/wl_engine.hh diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index f2245f571f..c8c9784ed1 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -29,8 +29,6 @@ #ifndef 
__ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ #define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ - -#include "accl/graph/base/util.hh" #include "params/BaseReduceEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -43,7 +41,6 @@ class BaseReduceEngine : public ClockedObject private: System* system; - protected: const RequestorID _requestorId; @@ -55,8 +52,6 @@ class BaseReduceEngine : public ClockedObject ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } - - virtual void handleIncomingWL(Addr addr, WorkListItem wl) = 0; }; } diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/data_structs.hh similarity index 100% rename from src/accl/graph/base/util.hh rename to src/accl/graph/base/data_structs.hh diff --git a/src/accl/graph/base/old/BaseApplyEngine.py b/src/accl/graph/base/old/BaseApplyEngine.py deleted file mode 100644 index 9b240581ac..0000000000 --- a/src/accl/graph/base/old/BaseApplyEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BaseApplyEngine(BaseEngine): - abstract = True - type = 'BaseApplyEngine' - cxx_header = 'accl/graph/base/base_apply_engine.hh' - cxx_class = 'gem5::BaseApplyEngine' diff --git a/src/accl/graph/base/old/BaseEngine.py b/src/accl/graph/base/old/BaseEngine.py deleted file mode 100644 index 16c2f402e5..0000000000 --- a/src/accl/graph/base/old/BaseEngine.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject - -class BaseEngine(ClockedObject): - abstract = True - type = 'BaseEngine' - cxx_header = "accl/graph/base/base_engine.hh" - cxx_class = 'gem5::BaseEngine' - - system = Param.System(Parent.any, 'System this Engine is a part of') - mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/old/BasePushEngine.py b/src/accl/graph/base/old/BasePushEngine.py deleted file mode 100644 index 2163864be3..0000000000 --- a/src/accl/graph/base/old/BasePushEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BasePushEngine(BaseEngine): - abstract = True - type = 'BasePushEngine' - cxx_header = "accl/graph/base/base_push_engine.hh" - cxx_class = 'gem5::BasePushEngine' diff --git a/src/accl/graph/base/old/BaseWLEngine.py b/src/accl/graph/base/old/BaseWLEngine.py deleted file mode 100644 index 7311c396b3..0000000000 --- a/src/accl/graph/base/old/BaseWLEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BaseWLEngine(BaseEngine): - abstract = True - type = 'BaseWLEngine' - cxx_header = "accl/graph/base/base_wl_engine.hh" - cxx_class = 'gem5::BaseWLEngine' diff --git a/src/accl/graph/base/old/base_apply_engine.cc b/src/accl/graph/base/old/base_apply_engine.cc deleted file mode 100644 index 39f5dafc67..0000000000 --- a/src/accl/graph/base/old/base_apply_engine.cc +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_apply_engine.hh" - -#include - -#include "accl/graph/base/util.hh" -#include "debug/MPU.hh" - - -namespace gem5 -{ - -BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): - BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()) -{} - -bool -BaseApplyEngine::recvWLNotif(Addr addr) -{ - // TODO: Investigate the situation where the queue is full. - applyReadQueue.push(addr); - if (!nextApplyCheckEvent.scheduled()){ - schedule(nextApplyCheckEvent, nextCycle()); - } - return true; -} - -void -BaseApplyEngine::processNextApplyCheckEvent() -{ - // TODO: We might want to change the way this function - // pops items off queue, maybe we should pop every n cycles - // or change the clock domain for this simobject. - Addr addr = applyReadQueue.front(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); - if (acquireAddress(req_addr)) { - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffset[memPkt->req] = req_offset; - if (!memPortBlocked()) { - sendMemReq(memPkt); - applyReadQueue.pop(); - } - } - if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ - schedule(nextApplyCheckEvent, nextCycle()); - } -} - -void -BaseApplyEngine::processNextApplyEvent() -{ - PacketPtr pkt = memRespQueue.front(); - uint8_t* data = pkt->getPtr(); - - RequestPtr request = pkt->req; - Addr request_offset = requestOffset[request]; - - WorkListItem wl = memoryToWorkList(data + request_offset); - DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item[%lu]: %s\n" - , __func__, pkt->getAddr() + request_offset, wl.to_string()); - // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem - // to applyengine if temp_prop < prop. If temp_prop has not changed, why - // fwd it to applyengine? - if (wl.temp_prop < wl.prop) { - // TODO: instead of min add a Reduce function. 
- //update prop with temp_prop - wl.prop = wl.temp_prop; - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - - DPRINTF(MPU, "%s: Sending a pkt with this info. " - "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", - __func__, writePkt->getAddr(), - writePkt->getSize(), writePkt->printData()); - - if (!memPortBlocked()) { - if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { - sendMemReq(writePkt); - memRespQueue.pop(); - DPRINTF(MPU, "%s: The Apply Engine is applying the new value into WorkList Item[%lu]: %s\n" - , __func__, pkt->getAddr() + request_offset, wl.to_string()); - } - } - } else { - memRespQueue.pop(); - } - if (!releaseAddress(pkt->getAddr())) { - panic("Could not release an address"); - } - if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextApplyEvent, nextCycle()); - } -} - -void -BaseApplyEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextApplyEvent.scheduled()) { - schedule(nextApplyEvent, nextCycle()); - } -} - -} diff --git a/src/accl/graph/base/old/base_apply_engine.hh b/src/accl/graph/base/old/base_apply_engine.hh deleted file mode 100644 index f4df298079..0000000000 --- a/src/accl/graph/base/old/base_apply_engine.hh +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_engine.hh" -#include "mem/request.hh" -#include "params/BaseApplyEngine.hh" - -namespace gem5 -{ - -class BaseApplyEngine : public BaseEngine -{ - private: - std::queue applyReadQueue; - - std::unordered_map requestOffset; - - EventFunctionWrapper nextApplyCheckEvent; - void processNextApplyCheckEvent(); - - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - - protected: - virtual bool sendApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edgeIndex) = 0; - virtual bool acquireAddress(Addr addr) = 0; - virtual bool releaseAddress(Addr addr) = 0; - virtual void scheduleMainEvent() override; - - public: - PARAMS(BaseApplyEngine); - - BaseApplyEngine(const BaseApplyEngineParams &apply); - - bool recvWLNotif(Addr addr); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_engine.cc b/src/accl/graph/base/old/base_engine.cc deleted file mode 100644 index ad87bb3662..0000000000 --- a/src/accl/graph/base/old/base_engine.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/base/base_engine.hh" -#include "debug/MPU.hh" -namespace gem5 -{ - -BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : - ClockedObject(params), - system(params.system), - memPort(name() + ".memPort", this), - requestorId(system->getRequestorId(this)) -{ - DPRINTF(MPU, "%s: My requestorId is %u,\n", __func__, requestorId); -} - -BaseEngine::~BaseEngine() -{} - -Port& -BaseEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "mem_port") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -BaseEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - return owner->handleMemResp(pkt); - -} - -void -BaseEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -bool -BaseEngine::handleMemResp(PacketPtr pkt) -{ - if (pkt->isResponse() && pkt->isWrite()) { - return true; - } - memRespQueue.push(pkt); - scheduleMainEvent(); - return true; -} - -} diff --git a/src/accl/graph/base/old/base_engine.hh b/src/accl/graph/base/old/base_engine.hh deleted file mode 100644 index 53415ddc7c..0000000000 --- a/src/accl/graph/base/old/base_engine.hh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ - -#include -#include - -#include "base/addr_range.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/BaseEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" - -namespace gem5 -{ - -class BaseEngine : public ClockedObject -{ - private: - class MemPort : public RequestPort - { - private: - BaseEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - System* system; - MemPort memPort; - - bool handleMemResp(PacketPtr resp); - - protected: - const RequestorID requestorId; - // TODO: Add this later, maybe? 
- // int memRespQueueSize; - std::queue memRespQueue; - - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } - - virtual void scheduleMainEvent() = 0; - - public: - PARAMS(BaseEngine); - - BaseEngine(const BaseEngineParams ¶ms); - ~BaseEngine(); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_push_engine.cc b/src/accl/graph/base/old/base_push_engine.cc deleted file mode 100644 index 4ebe40e486..0000000000 --- a/src/accl/graph/base/old/base_push_engine.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/base/base_push_engine.hh" - -#include "accl/graph/base/util.hh" -#include "debug/MPU.hh" - -namespace gem5 -{ - -BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : - BaseEngine(params), - nextReadEvent([this] { processNextReadEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()) -{} - -bool -BasePushEngine::recvApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edge_index) -{ - notifQueue.emplace(prop, degree, edge_index); - if (!nextReadEvent.scheduled()) { - schedule(nextReadEvent, nextCycle()); - } - DPRINTF(MPU, "%s: Reading %d edges.", __func__, degree); - return true; -} - -void -BasePushEngine::processNextReadEvent() -{ - ApplyNotif notif = notifQueue.front(); - - std::vector addr_queue; - std::vector offset_queue; - std::vector num_edge_queue; - - for (uint32_t index = 0; index < notif.degree; index++) { - // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 + (notif.edgeIndex + index) * sizeof(Edge); - Addr req_addr = (edge_addr / 64) * 64; - Addr req_offset = edge_addr % 64; - if (addr_queue.size()) { - if (addr_queue.back() == req_addr) { - num_edge_queue.back()++; - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - }; - - for (int index = 
0; index < addr_queue.size(); index++) { - if (!memPortBlocked()) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = notif.prop; - sendMemReq(pkt); - notifQueue.pop(); - } - } - - if (!nextReadEvent.scheduled() && !notifQueue.empty()) { - schedule(nextReadEvent, nextCycle()); - } -} - -void -BasePushEngine::processNextPushEvent() -{ - PacketPtr pkt = memRespQueue.front(); - RequestPtr req = pkt->req; - uint8_t *data = pkt->getPtr(); - - Addr offset = reqOffsetMap[req]; - int num_edges = reqNumEdgeMap[req]; - uint32_t value = reqValueMap[req]; - - int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); - for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); - Edge e = memoryToEdge(curr_edge_data); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here - *update_data = value + 1; - PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - requestorId); - if (sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop(); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); - // TODO: Erase map entries here. 
- } - } - - if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextPushEvent, nextCycle()); - } -} - -void -BasePushEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextPushEvent.scheduled()) { - schedule(nextPushEvent, nextCycle()); - } -} - -} diff --git a/src/accl/graph/base/old/base_push_engine.hh b/src/accl/graph/base/old/base_push_engine.hh deleted file mode 100644 index 01027d2791..0000000000 --- a/src/accl/graph/base/old/base_push_engine.hh +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ - -#include - -#include "accl/graph/base/base_engine.hh" -#include "mem/request.hh" -#include "params/BasePushEngine.hh" - -namespace gem5 -{ - -class BasePushEngine : public BaseEngine -{ - private: - struct ApplyNotif { - uint32_t prop; - uint32_t degree; - uint32_t edgeIndex; - - ApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index): - prop(prop), degree(degree), edgeIndex(edge_index) - {} - }; - - std::queue notifQueue; - // int notifQueueSize; - - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; - - EventFunctionWrapper nextReadEvent; - void processNextReadEvent(); - - EventFunctionWrapper nextPushEvent; - void processNextPushEvent(); - - protected: - virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void scheduleMainEvent() override; - - public: - - PARAMS(BasePushEngine); - - BasePushEngine(const BasePushEngineParams ¶ms); - - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_wl_engine.cc b/src/accl/graph/base/old/base_wl_engine.cc deleted file mode 100644 index fd45b85077..0000000000 --- a/src/accl/graph/base/old/base_wl_engine.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of 
California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_wl_engine.hh" -#include "debug/MPU.hh" - -#include - -namespace gem5 -{ - -BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): - BaseEngine(params), - nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), - nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) -{} - -bool -BaseWLEngine::handleWLUpdate(PacketPtr pkt) -{ - updateQueue.push(pkt); - if(!nextWLReadEvent.scheduled()) { - schedule(nextWLReadEvent, nextCycle()); - } - return true; -} - -void BaseWLEngine::processNextWLReadEvent() -{ - PacketPtr pkt = updateQueue.front(); - uint32_t value = *(pkt->getPtr()); - - Addr addr = pkt->getAddr(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = addr % 64; - - if (acquireAddress(req_addr)) { - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[memPkt->req] = req_offset; - requestValueMap[memPkt->req] = value; - - if (!memPortBlocked()) { - sendMemReq(memPkt); - updateQueue.pop(); - } - else{ - releaseAddress(req_addr); - } - } - if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { - schedule(nextWLReadEvent, nextCycle()); - } -} - -void -BaseWLEngine::processNextWLReduceEvent() -{ - PacketPtr resp = memRespQueue.front(); - uint8_t* respData = resp->getPtr(); - Addr request_offset = requestOffsetMap[resp->req]; - uint32_t value = requestValueMap[resp->req]; - WorkListItem wl = memoryToWorkList(respData + request_offset); - - DPRINTF(MPU, "%s: The WLE is reading WorkList item [%lu]: %s %d\n" - , __func__, resp->getAddr() + request_offset, wl.to_string(), value); - if (value < wl.temp_prop){ - //update prop with temp_prop - wl.temp_prop = value; - - uint8_t* wlData = workListToMemory(wl); - memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); - PacketPtr writePkt = - getWritePacket(resp->getAddr(), 64, respData, requestorId); - - DPRINTF(MPU, "%s: Sending a pkt with this info. 
" - "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", - __func__, writePkt->getAddr(), - writePkt->getSize(), writePkt->printData()); - if (!memPortBlocked()) { - if (sendWLNotif(resp->getAddr() + request_offset)) { - sendMemReq(writePkt); - memRespQueue.pop(); - DPRINTF(MPU, "%s: The WLE is changing to: %s\n" - , __func__, wl.to_string()); - // TODO: Erase map entries, delete wlData; - } - } - } - else { - memRespQueue.pop(); - } - if (!releaseAddress(resp->getAddr())) { - panic("Could not release an address"); - } - if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextWLReduceEvent, nextCycle()); - } -} - -void -BaseWLEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextWLReduceEvent.scheduled()) { - schedule(nextWLReduceEvent, nextCycle()); - } -} - - -} diff --git a/src/accl/graph/base/old/base_wl_engine.hh b/src/accl/graph/base/old/base_wl_engine.hh deleted file mode 100644 index 15371f965b..0000000000 --- a/src/accl/graph/base/old/base_wl_engine.hh +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_engine.hh" -#include "accl/graph/base/util.hh" -#include "params/BaseWLEngine.hh" - -namespace gem5 -{ - -class BaseWLEngine : public BaseEngine -{ - private: - std::queue updateQueue; - std::queue responseQueue; - - std::unordered_map requestOffsetMap; - std::unordered_map requestValueMap; - - //Events - EventFunctionWrapper nextWLReadEvent; - void processNextWLReadEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + MPU::MPU::MemPortsendTimingReq - */ - - EventFunctionWrapper nextWLReduceEvent; - void processNextWLReduceEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - protected: - virtual bool sendWLNotif(Addr addr) = 0; - virtual bool acquireAddress(Addr addr) = 0; - virtual bool releaseAddress(Addr addr) = 0; - virtual void scheduleMainEvent() override; - - public: - - PARAMS(BaseWLEngine); - - BaseWLEngine(const BaseWLEngineParams 
¶ms); - - bool handleWLUpdate(PacketPtr pkt); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2cb9856f76..ff30efde4c 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -30,7 +30,7 @@ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" -#include "accl/graph/base/util.hh" +#include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" diff --git a/src/accl/graph/sega/old/ApplyEngine.py b/src/accl/graph/sega/old/ApplyEngine.py deleted file mode 100644 index 7a446bb620..0000000000 --- a/src/accl/graph/sega/old/ApplyEngine.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseApplyEngine import BaseApplyEngine - -class ApplyEngine(BaseApplyEngine): - type = 'ApplyEngine' - cxx_header = "accl/graph/sega/apply_engine.hh" - cxx_class = 'gem5::ApplyEngine' - - push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") - lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/old/LockDir.py b/src/accl/graph/sega/old/LockDir.py deleted file mode 100644 index d21963dc3a..0000000000 --- a/src/accl/graph/sega/old/LockDir.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2012-2014, 2017-2018 ARM Limited -# All rights reserved. -# -# The license below extends only to copyright in the software and shall -# not be construed as granting a license to any other intellectual -# property including but not limited to intellectual property relating -# to a hardware implementation of the functionality of the software -# licensed hereunder. You may use the software subject to the license -# terms below provided that you ensure that this notice is replicated -# unmodified and in its entirety in all distributions of the software, -# modified or unmodified, in source code or in binary form. -# -# Copyright (c) 2007 The Regents of The University of Michigan -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.SimObject import SimObject - -class LockDirectory(SimObject): - type = 'LockDirectory' - cxx_header = 'accl/graph/sega/lock_dir.hh' - cxx_class = 'gem5::LockDirectory' diff --git a/src/accl/graph/sega/old/PushEngine.py b/src/accl/graph/sega/old/PushEngine.py deleted file mode 100644 index a743b57262..0000000000 --- a/src/accl/graph/sega/old/PushEngine.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BasePushEngine import BasePushEngine - -class PushEngine(BasePushEngine): - type = 'PushEngine' - cxx_header = "accl/graph/sega/push_engine.hh" - cxx_class = 'gem5::PushEngine' - - req_port = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/old/WLEngine.py b/src/accl/graph/sega/old/WLEngine.py deleted file mode 100644 index b6e697266e..0000000000 --- a/src/accl/graph/sega/old/WLEngine.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseWLEngine import BaseWLEngine - -class WLEngine(BaseWLEngine): - type = 'WLEngine' - cxx_header = "accl/graph/sega/wl_engine.hh" - cxx_class = 'gem5::WLEngine' - - resp_port = ResponsePort("Port to Receive updates from outside") - apply_engine = Param.ApplyEngine(Parent.any, - "MPU object that owns this WLEngine") - lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/old/apply_engine.cc b/src/accl/graph/sega/old/apply_engine.cc deleted file mode 100644 index 544bb082ad..0000000000 --- a/src/accl/graph/sega/old/apply_engine.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/apply_engine.hh" - -namespace gem5{ - -ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : - BaseApplyEngine(params), - pushEngine(params.push_engine), - lockDir(params.lock_dir) -{} - -bool -ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) -{ - return pushEngine->recvApplyNotif(prop, degree, edgeIndex); - -} - -bool -ApplyEngine::acquireAddress(Addr addr) -{ - return lockDir->acquire(addr, requestorId); -} - -bool -ApplyEngine::releaseAddress(Addr addr) -{ - return lockDir->release(addr, requestorId); -} - -} diff --git a/src/accl/graph/sega/old/apply_engine.hh b/src/accl/graph/sega/old/apply_engine.hh deleted file mode 100644 index c88330487a..0000000000 --- a/src/accl/graph/sega/old/apply_engine.hh +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" -#include "accl/graph/sega/push_engine.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/ApplyEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/port.hh" - -namespace gem5 -{ - - -class ApplyEngine : public BaseApplyEngine -{ - private: - PushEngine* pushEngine; - LockDirectory* lockDir; - - protected: - virtual bool sendApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edgeIndex) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; - - public: - PARAMS(ApplyEngine); - ApplyEngine(const ApplyEngineParams ¶ms); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/lock_dir.cc b/src/accl/graph/sega/old/lock_dir.cc deleted file mode 100644 index 6a4496175d..0000000000 --- a/src/accl/graph/sega/old/lock_dir.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/lock_dir.hh" - -namespace gem5 -{ - -LockDirectory::LockDirectory(const LockDirectoryParams ¶ms) : - SimObject(params) -{} - -bool -LockDirectory::acquire(Addr addr, RequestorID requestorId) -{ - if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { - lockOwnerMap[addr] = requestorId; - return true; - } else { - return false; - } -} - -bool -LockDirectory::release(Addr addr, RequestorID requestorId) -{ - if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { - panic("Should not relase an address before acquiring"); - } else if (lockOwnerMap[addr] != requestorId) { - panic("Should not release and address you don't own"); - } else { - lockOwnerMap.erase(addr); - return true; - } - return false; -} - -} diff --git a/src/accl/graph/sega/old/lock_dir.hh b/src/accl/graph/sega/old/lock_dir.hh deleted file mode 100644 index 012334ce43..0000000000 --- a/src/accl/graph/sega/old/lock_dir.hh +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ -#define __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ - -#include - -#include "mem/packet.hh" -#include "params/LockDirectory.hh" -#include "sim/sim_object.hh" - -namespace gem5 -{ - -class LockDirectory: public SimObject -{ - private: - std::unordered_map lockOwnerMap; - // std::unordered_map lockDegreeMap; - - public: - PARAMS(LockDirectory); - LockDirectory(const LockDirectoryParams ¶ms); - - bool acquire(Addr addr, RequestorID requestorId); - bool release(Addr addr, RequestorID requestorId); -}; - -} - -#endif diff --git a/src/accl/graph/sega/old/push_engine.cc b/src/accl/graph/sega/old/push_engine.cc deleted file mode 100644 index c7b229ad33..0000000000 --- a/src/accl/graph/sega/old/push_engine.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/push_engine.hh" - -namespace gem5 -{ - -PushEngine::PushEngine(const PushEngineParams ¶ms) : - BasePushEngine(params), - reqPort(name() + "reqPort", this) -{} - -Port& -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return BasePushEngine::getPort(if_name, idx); - } -} - -void -PushEngine::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -bool -PushEngine::sendPushUpdate(PacketPtr pkt) -{ - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return false; -} - -} diff --git a/src/accl/graph/sega/old/push_engine.hh b/src/accl/graph/sega/old/push_engine.hh deleted file mode 100644 index 604df4750d..0000000000 --- a/src/accl/graph/sega/old/push_engine.hh +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ - -#include "accl/graph/base/base_push_engine.hh" -#include "params/PushEngine.hh" - -namespace gem5 -{ - -class MPU; - -class PushEngine : public BasePushEngine -{ - private: - class ReqPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, PushEngine* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - ReqPort reqPort; - - protected: - virtual bool sendPushUpdate(PacketPtr pkt) override; - - public: - PARAMS(PushEngine); - PushEngine(const PushEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} - -#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/wl_engine.cc b/src/accl/graph/sega/old/wl_engine.cc deleted file mode 100644 index 03f74f1019..0000000000 --- a/src/accl/graph/sega/old/wl_engine.cc +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/wl_engine.hh" -#include "debug/MPU.hh" -namespace gem5 -{ - -WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params), - respPort(name() + ".respPort", this), - applyEngine(params.apply_engine), - lockDir(params.lock_dir) -{} - -Port& -WLEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "resp_port") { - return respPort; - } else { - return BaseWLEngine::getPort(if_name, idx); - } -} - -void -WLEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - WorkListItem vertices [5] = { - {10000, 10000, 3, 0}, // Addr: 0 - {10000, 10000, 1, 3}, // Addr: 16 - {10000, 10000, 1, 4}, // Addr: 32 - {10000, 10000, 1, 5}, // Addr: 48 - {10000, 10000, 0, 6} // Addr: 64 - }; - Edge edges [7] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64}, // Addr: 1048640 - {0, 32} - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - sendMemFunctional(pkt); - } - - for (int i = 0; i < 7; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - sendMemFunctional(pkt); - } - - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, requestorId); - - handleWLUpdate(first_update); -} - -bool -WLEngine::sendWLNotif(Addr addr){ - return applyEngine->recvWLNotif(addr); -} - -AddrRangeList -WLEngine::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -WLEngine::RespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleWLUpdate(pkt); -} - -Tick -WLEngine::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void 
-WLEngine::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - // FIXME: This needs to be fixed - // if (pkt->cmd == MemCmd::UpdateWL) { - // panic("Functional requests should not be made to WL."); - // //TODO: Might be a good idea to implement later. - // // wlEngine->recvFunctional(pkt); - // } else { - sendMemFunctional(pkt); - // } -} - -bool -WLEngine::acquireAddress(Addr addr) -{ - return lockDir->acquire(addr, requestorId); -} - -bool -WLEngine::releaseAddress(Addr addr) -{ - return lockDir->release(addr, requestorId); -} - -} diff --git a/src/accl/graph/sega/old/wl_engine.hh b/src/accl/graph/sega/old/wl_engine.hh deleted file mode 100644 index 4e8a25795a..0000000000 --- a/src/accl/graph/sega/old/wl_engine.hh +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" -#include "params/WLEngine.hh" - -namespace gem5 -{ - -class ApplyEngine; - -class WLEngine : public BaseWLEngine -{ - private: - class RespPort : public ResponsePort - { - private: - WLEngine* owner; - - public: - RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - RespPort respPort; - ApplyEngine* applyEngine; - LockDirectory* lockDir; - - virtual void startup(); - void recvFunctional(PacketPtr pkt); - - protected: - virtual bool sendWLNotif(Addr addr) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; - - public: - PARAMS(WLEngine); - WLEngine(const WLEngineParams ¶ms); - Port& getPort(const 
std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} -#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1b1a812d16..4c9822345f 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,7 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" -#include "accl/graph/base/util.hh" +#include "accl/graph/base/data_structs.hh" #include "params/PushEngine.hh" namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c1ef028f77..a8dff32d44 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" @@ -108,7 +109,7 @@ class WLEngine : public BaseReduceEngine bool handleIncomingUpdate(PacketPtr pkt); - virtual void handleIncomingWL(Addr addr, WorkListItem wl); + void handleIncomingWL(Addr addr, WorkListItem wl); }; } From 2d18a7b77fb6e0bdbb5d9fae5ef92ee9a3181311 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 1 Apr 2022 11:07:05 -0700 Subject: [PATCH 079/247] Fixing bugs. 
--- configs/accl/sega.py | 9 +-- src/accl/graph/sega/push_engine.cc | 110 +++++++++++++++-------------- src/accl/graph/sega/push_engine.hh | 6 +- src/accl/graph/sega/wl_engine.cc | 23 +++--- 4 files changed, 78 insertions(+), 70 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 680157ba7e..a0c7766fe0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,11 +5,12 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=0x80000000, - push_req_queue_size = 16) + push_req_queue_size=16, + mem_resp_queue_size=8) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size = 16, + update_queue_size=16, on_the_fly_update_map_size=8) self.interconnect = SystemXBar() @@ -60,9 +61,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="epinions/graph_binaries/vertices", + vertex_binary="facebook/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="epinions/graph_binaries/edgelist_0") + edge_binary="facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.getPort()) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 450ba9ddc4..0b4c981d48 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -65,6 +65,7 @@ PushEngine::startup() *tempPtr = 0; PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + // PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); sendPushUpdate(first_update); } @@ -109,7 +110,11 @@ PushEngine::recvWLItem(WorkListItem wl) return false; } - pushReqQueue.push_back(wl); + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * 
sizeof(Edge)); + uint32_t update_value = wl.prop; + pushReqQueue.push_back( + std::make_pair(std::make_pair(start_addr, end_addr), update_value)); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { @@ -121,43 +126,36 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::processNextAddrGenEvent() { - WorkListItem wl = pushReqQueue.front(); - - std::vector addr_queue; - std::vector offset_queue; - std::vector num_edge_queue; - - for (uint32_t index = 0; index < wl.degree; index++) { - Addr edge_addr = baseEdgeAddr + (wl.edgeIndex + index) * sizeof(Edge); - Addr req_addr = (edge_addr / 64) * 64; - Addr req_offset = edge_addr % 64; - if (addr_queue.size()) { - if (addr_queue.back() == req_addr) { - num_edge_queue.back()++; - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - }; - - for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = createReadPacket(addr_queue[index], 64); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = wl.prop; - pendingReadReqs.push_back(pkt); + Addr start_addr, end_addr; + uint32_t update_value; + + std::pair, uint32_t> front = pushReqQueue.front(); + std::tie(start_addr, end_addr) = front.first; + update_value = front.second; + + Addr req_addr = (start_addr / 64) * 64; + Addr req_offset = start_addr % 64; + int num_edges = 0; + + if (end_addr > req_addr + 64) { + num_edges = (req_addr + 64 - start_addr) / sizeof(Edge); + } else { + num_edges = (end_addr - start_addr) / sizeof(Edge); } + PacketPtr pkt = createReadPacket(req_addr, 64); + reqOffsetMap[pkt->req] = req_offset; + reqNumEdgeMap[pkt->req] = num_edges; + reqValueMap[pkt->req] = update_value; + pendingReadReqs.push_back(pkt); pushReqQueue.pop_front(); + if (req_addr + 64 < end_addr) { + 
pushReqQueue.push_front( + std::make_pair(std::make_pair(req_addr + 64, end_addr), update_value) + ); + } + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); } @@ -207,26 +205,30 @@ PushEngine::processNextPushEvent() __func__, pkt->getAddr()); Addr offset = reqOffsetMap[req]; - int num_edges = reqNumEdgeMap[req]; uint32_t value = reqValueMap[req]; - for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * sizeof(Edge)); - Edge* e = (Edge*) (curr_edge_data); - DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here - *update_data = value + 1; - DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, *update_data); - PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); - - if (sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop_front(); - // TODO: Erase map entries here. 
- } + Edge* e = (Edge*) (data + offset); + DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); + // TODO: Implement propagate function here + *update_data = value + 1; + // uint32_t update_value = value + 1; + DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", + __func__, e->neighbor, *update_data); + PacketPtr update = createUpdatePacket(e->neighbor, + sizeof(uint32_t), (uint8_t*) update_data); + + if (sendPushUpdate(update)) { + reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); + reqNumEdgeMap[req]--; + } + + if (reqNumEdgeMap[req] == 0) { + memRespQueue.pop_front(); + reqOffsetMap.erase(req); + reqNumEdgeMap.erase(req); + reqValueMap.erase(req); } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { @@ -235,7 +237,8 @@ PushEngine::processNextPushEvent() } PacketPtr -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) +// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -247,6 +250,7 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) pkt->allocate(); pkt->setData(data); + // pkt->setLE(value); return pkt; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4c9822345f..faee5128b7 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -64,8 +64,9 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::deque pushReqQueue; + std::deque, uint32_t>> pushReqQueue; + // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; @@ -79,7 
+80,8 @@ class PushEngine : public BaseReadEngine virtual void startup(); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); + // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); bool sendPushUpdate(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 73eacf945f..117abb61e8 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -106,17 +106,18 @@ WLEngine::processNextReadEvent() uint32_t* update_value = update->getPtr(); // FIXME: else logic is wrong - if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && - (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { - if (coalesceEngine->recvReadAddr(update_addr)) { - DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " - "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, *update_value); - onTheFlyUpdateMap[update_addr] = *update_value; - DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); - updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { + if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + if (coalesceEngine->recvReadAddr(update_addr)) { + DPRINTF(MPU, "%s: Received an update and it's not been pulled in. 
" + "update_addr: %lu, update_value: %u.\n", + __func__, update_addr, *update_value); + onTheFlyUpdateMap[update_addr] = *update_value; + DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); + updateQueue.pop_front(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + } } } else { // TODO: Generalize this to reduce function rather than just min From cf001ea9840f11ad2d78fa73c83cb5100039819a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 3 Apr 2022 15:39:56 -0700 Subject: [PATCH 080/247] Updating createUpdatePacket. --- src/accl/graph/TODO.md | 8 ++++++++ src/accl/graph/sega/coalesce_engine.cc | 17 ++++------------- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 26 +++++++++++++------------- src/accl/graph/sega/push_engine.hh | 4 ++-- src/accl/graph/sega/wl_engine.cc | 14 ++++++++------ 6 files changed, 36 insertions(+), 34 deletions(-) create mode 100644 src/accl/graph/TODO.md diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md new file mode 100644 index 0000000000..d5effbeb96 --- /dev/null +++ b/src/accl/graph/TODO.md @@ -0,0 +1,8 @@ +# TODO Items + +* use setLE/setBE inside createUpdatePacket and createWritePacket +* parameterize cache size, associativity, maybe latencies, +and memory atom size in the coalesce engine +* look at all the simobjects and come up with a general architecture. Make +sure all the simobjects follow that architecture. +* implement all the communications between simobjects as req/retry. 
diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 9fed1e8230..8d97fffd20 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -300,19 +301,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) - bool found = false; if ((cacheBlocks[block_index].takenMask == 0)) { - for (auto index : evictQueue) { - if (block_index == index) { - found = true; - break; - } - } - if (!found) { - evictQueue.push_back(block_index); - } - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); + evictQueue.push_back(block_index); } if ((!nextApplyAndCommitEvent.scheduled()) && @@ -328,6 +318,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() int block_index = evictQueue.front(); uint8_t changedMask = 0; // TODO: parameterize 64 to memory atom size + uint8_t* wl_data; uint8_t data[64]; for (int i = 0; i < 4; i++) { @@ -341,7 +332,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. 
" "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, i, cacheBlocks[block_index].items[i].to_string()); - uint8_t* wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); + wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); std::memcpy(data + (i * sizeof(WorkListItem)), wl_data, sizeof(WorkListItem)); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ff30efde4c..5c4e752cbf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -84,6 +84,7 @@ class CoalesceEngine : public BaseReadEngine virtual void startup(); PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0b4c981d48..870b32f2fb 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -64,8 +65,8 @@ PushEngine::startup() uint32_t* tempPtr = (uint32_t*) first_update_data; *tempPtr = 0; - PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); - // PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); + // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); sendPushUpdate(first_update); } @@ -193,7 +194,7 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -// FIXME: FIX THIS FUNCTION FOR TIMING AND FUNCTIONAL ACCURACY. +// TODO: Add a parameter to allow for doing multiple pushes at the same time. 
void PushEngine::processNextPushEvent() { @@ -209,15 +210,14 @@ PushEngine::processNextPushEvent() Edge* e = (Edge*) (data + offset); DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); + // TODO: Implement propagate function here - *update_data = value + 1; - // uint32_t update_value = value + 1; + uint32_t update_value = value + 1; DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, *update_data); + __func__, e->neighbor, update_value); + PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t), (uint8_t*) update_data); + sizeof(uint32_t), update_value); if (sendPushUpdate(update)) { reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); @@ -237,8 +237,8 @@ PushEngine::processNextPushEvent() } PacketPtr -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) -// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) +// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -249,8 +249,8 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->allocate(); - pkt->setData(data); - // pkt->setLE(value); + // pkt->setData(data); + pkt->setLE(value); return pkt; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index faee5128b7..a539079ede 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -80,8 +80,8 @@ class PushEngine : public BaseReadEngine virtual void startup(); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); - // PacketPtr 
createUpdatePacket(Addr addr, unsigned int size, uint32_t value); + // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); bool sendPushUpdate(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 117abb61e8..3a6911c1bf 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,9 @@ */ #include "accl/graph/sega/wl_engine.hh" + #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -103,7 +105,7 @@ WLEngine::processNextReadEvent() { PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); - uint32_t* update_value = update->getPtr(); + uint32_t update_value = update->getLE(); // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { @@ -111,8 +113,8 @@ WLEngine::processNextReadEvent() if (coalesceEngine->recvReadAddr(update_addr)) { DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, *update_value); - onTheFlyUpdateMap[update_addr] = *update_value; + __func__, update_addr, update_value); + onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); @@ -123,10 +125,10 @@ WLEngine::processNextReadEvent() // TODO: Generalize this to reduce function rather than just min DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." 
"update_addr: %lu, update_value: %u, old_value: %u.\n", - __func__, update_addr, *update_value, + __func__, update_addr, update_value, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = - std::min(*update_value, onTheFlyUpdateMap[update_addr]); + std::min(update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); @@ -154,7 +156,6 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - std::unordered_map::iterator it = addrWorkListMap.begin(); @@ -190,6 +191,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push_back(pkt); + assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { From c405e30aacbebd410d24fc83924f9769ea8e74f9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 3 Apr 2022 17:26:15 -0700 Subject: [PATCH 081/247] Adding retry to wle respPort and debug. --- src/accl/graph/sega/push_engine.cc | 13 +++++++++---- src/accl/graph/sega/wl_engine.cc | 31 +++++++++++++++++++++++++----- src/accl/graph/sega/wl_engine.hh | 3 +++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 870b32f2fb..70d6242f5b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -95,10 +95,12 @@ PushEngine::ReqPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + DPRINTF(MPU, "%s: Received a reqRetry.\n", __func__); + _blocked = false; sendPacket(blockedPacket); - if (!blocked()) { + if (!_blocked) { blockedPacket = nullptr; } } @@ -202,12 +204,13 @@ PushEngine::processNextPushEvent() RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); - DPRINTF(MPU, "%s: Looking at the front of the queue. 
pkt->Addr: %lu.\n", - __func__, pkt->getAddr()); - Addr offset = reqOffsetMap[req]; uint32_t value = reqValueMap[req]; + DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " + "offset: %lu\n", + __func__, pkt->getAddr(), offset); + Edge* e = (Edge*) (data + offset); DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); @@ -220,6 +223,8 @@ PushEngine::processNextPushEvent() sizeof(uint32_t), update_value); if (sendPushUpdate(update)) { + DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", + __func__, e->neighbor, update_value); reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); reqNumEdgeMap[req]--; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 3a6911c1bf..27c7ad4fea 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -64,10 +64,25 @@ WLEngine::RespPort::getAddrRanges() const return owner->getAddrRanges(); } +void +WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + DPRINTF(MPU, "%s: Sending a reqRetry.\n", __func__); + sendRetryReq(); + needSendRetryReq = false; + } +} + bool WLEngine::RespPort::recvTimingReq(PacketPtr pkt) { - return owner->handleIncomingUpdate(pkt); + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; } Tick @@ -107,7 +122,6 @@ WLEngine::processNextReadEvent() Addr update_addr = update->getAddr(); uint32_t update_value = update->getLE(); - // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { if (coalesceEngine->recvReadAddr(update_addr)) { @@ -118,7 +132,11 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: 0: updateQueue.size: %d.\n", __func__, 
updateQueue.size()); + if (updateQueue.size() == updateQueueSize - 1) { + respPort.checkRetryReq(); + } + } } } else { @@ -131,8 +149,10 @@ WLEngine::processNextReadEvent() std::min(update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); - // TODO: Add a stat to count the number of coalescions + DPRINTF(MPU, "%s: 1: updateQueue.size: %d.\n", __func__, updateQueue.size()); + if (updateQueue.size() == updateQueueSize - 1) { + respPort.checkRetryReq(); + } } // TODO: Only schedule nextReadEvent only when it has to be scheduled @@ -180,6 +200,7 @@ WLEngine::processNextReduceEvent() for (int i = 0; i < servicedAddresses.size(); i++) { onTheFlyUpdateMap.erase(servicedAddresses[i]); } + DPRINTF(MPU, "%s: onTheFlyUpdateMap.size(): %u, servicedAddresses.size(): %u.\n", __func__, onTheFlyUpdateMap.size(), servicedAddresses.size()); } bool diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index a8dff32d44..476c9be932 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -48,6 +48,7 @@ class WLEngine : public BaseReduceEngine { private: WLEngine* owner; + bool needSendRetryReq; public: RespPort(const std::string& name, WLEngine* owner): @@ -55,6 +56,8 @@ class WLEngine : public BaseReduceEngine {} virtual AddrRangeList getAddrRanges() const; + void checkRetryReq(); + protected: virtual bool recvTimingReq(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt); From f43564614cbf10d78bb23122e2242e657776ebef Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 5 Apr 2022 09:20:52 -0700 Subject: [PATCH 082/247] Debugging coalesce engine deadlock. 
--- src/accl/graph/base/data_structs.hh | 8 +- src/accl/graph/sega/coalesce_engine.cc | 247 ++++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 71 ++++--- 5 files changed, 254 insertions(+), 76 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index b51a9f0781..dacb74e38c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -70,10 +70,10 @@ struct __attribute__ ((packed)) Edge uint16_t weight : 16; uint64_t neighbor : 48; - std::string to_string() - { - return csprintf("Edge{weight: %lu, neighbor: %lu}", weight, neighbor); - } + // std::string to_string() + // { + // return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + // } Edge(uint16_t weight, uint64_t neighbor): weight(weight), diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8d97fffd20..d7fa806fff 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -41,6 +41,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + alarmRequested(false), + spaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), @@ -77,17 +79,21 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / 64) * 64; - int block_index = aligned_addr % 256; + int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); if ((cacheBlocks[block_index].addr == aligned_addr) && 
(cacheBlocks[block_index].valid)) { // Hit - DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" - , __func__, addr); // TODO: Make addrQueue and wlQueue into one std::pair addrResponseQueue.push_back(addr); - worklistResponseQueue.push_back(cacheBlocks[block_index].items[wl_offset]); + worklistResponseQueue.push_back( + cacheBlocks[block_index].items[wl_offset]); + DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " + "to worklistResponseQueue. worklistResponseQueue.size = %d.\n", + __func__, addr, block_index, wl_offset, + worklistResponseQueue.size(), + cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -101,50 +107,72 @@ CoalesceEngine::recvReadAddr(Addr addr) return true; } else { // miss + DPRINTF(MPU, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHRMap.find(block_index) == MSHRMap.end()) { + DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu not " + "found in MSHRs.\n", __func__, block_index, addr); assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries + DPRINTF(MPU, "%s: Out of MSHR entries. " + "Rejecting request.\n", __func__); return false; } else { + DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); + DPRINTF(MPU, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + "Rejecting request.\n", + __func__, block_index); return false; } - // MSHR available but conflict - DPRINTF(MPU, "%s: Read request with addr: %lu missed with " - "conflict. 
Making a request for " - "aligned_addr: %lu.\n", - __func__, addr, aligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d]", __func__, addr, block_index); return true; } else { // TODO: Set valid to false every deallocation and - // assert valid == false here. + assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict assert( outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " + "allocate a cache line for it.\n", + __func__, addr); if (outstandingMemReqQueue.size() == outstandingMemReqQueueSize) { + DPRINTF(MPU, "%s: No space in outstandingMemReqQueue " + "(outstandingMemReqQueue.size: %u). " + "Rejecting request.\n", __func__, + outstandingMemReqQueue.size()); return false; } - DPRINTF(MPU, "%s: Read request with addr: " - "%lu missed with no conflict. " - "Making a request for aligned_addr: %lu.\n" - , __func__, addr, aligned_addr); cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; + DPRINTF(MPU, "%s: Allocated cache line[%d] for " + "Addr: %lu.\n", __func__, block_index, addr); MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d].\n", __func__, addr, block_index); // TODO: Parameterize 64 to memory atom size PacketPtr pkt = createReadPacket(aligned_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, addr, aligned_addr); outstandingMemReqQueue.push_back(pkt); + DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue. 
" + "outstandingMemReqQueue.size = %d", __func__, + outstandingMemReqQueue.size()); stats.numVertexBlockReads++; @@ -156,14 +184,24 @@ CoalesceEngine::recvReadAddr(Addr addr) } } } else { + DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu already " + "in MSHRs.\n", __func__, block_index, addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + "Rejecting request.\n", + __func__, block_index); return false; } if ((!cacheBlocks[block_index].hasConflict) && (aligned_addr != cacheBlocks[block_index].addr)) { + DPRINTF(MPU, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; } MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d].\n", __func__, addr, block_index); return true; } } @@ -176,9 +214,24 @@ CoalesceEngine::processNextMemReqEvent() if (!memPortBlocked()) { sendMemReq(pkt); + DPRINTF(MPU, "%s: Sent a memory request to Addr: %lu, MemCmd: %s.\n", + __func__, pkt->getAddr(), pkt->isRead() ? "Read" : "Write"); outstandingMemReqQueue.pop_front(); + DPRINTF(MPU, "%s: Popped a packet from outstandingMemReqQueue. " + "outstandingMemReqQueue.size = %u.\n", __func__, + outstandingMemReqQueue.size()); } + if ((alarmRequested) && + (outstandingMemReqQueue.size() < + (outstandingMemReqQueueSize - spaceRequested))) { + alarmRequested = false; + spaceRequested = 0; + schedule(nextApplyAndCommitEvent, nextCycle()); + DPRINTF(MPU, "%s: There is an alarm request for " + "nextApplyAndCommitEvent. 
Reset alarm parameters and scheduled " + "nextApplyAndCommitEvent.\n", __func__); + } if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { schedule(nextMemReqEvent, nextCycle()); @@ -192,9 +245,14 @@ CoalesceEngine::processNextRespondEvent() WorkListItem worklist_response = worklistResponseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); + DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); addrResponseQueue.pop_front(); worklistResponseQueue.pop_front(); + DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " + "worklistResponseQueue.size = %d.\n", __func__, + worklistResponseQueue.size()); if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && @@ -208,15 +266,20 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); if (pkt->isWrite()) { + DPRINTF(MPU, "%s: Received a write response for Addr: %lu. Dropping " + "the packet.\n", __func__, pkt->getAddr()); return true; } Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - int block_index = addr % 256; // TODO: After parameterizing the cache size - // this 256 number should change to the cache - // size parameter. + // TODO: After parameterizing the cache size + // this 256 number should change to the cache + // size parameter. 
+ int block_index = (addr / 64) % 256; + DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", + __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR @@ -224,6 +287,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = *((WorkListItem*) ( data + (i * sizeof(WorkListItem)))); + DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, + block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; @@ -231,29 +296,42 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr alligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = (miss_addr / 64) * 64; - if (alligned_miss_addr == addr) { - int wl_offset = (miss_addr - alligned_miss_addr) / 16; + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / 16; + DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " + "be serviced with the received packet.\n", + __func__, miss_addr, block_index); addrResponseQueue.push_back(miss_addr); worklistResponseQueue.push_back( cacheBlocks[block_index].items[wl_offset]); + DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " + "worklistResponseQueue. 
worklistResponseQueue.size = %u.\n" + , __func__, block_index, wl_offset, + worklistResponseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; servicedIndices.push_back(i); + DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " + "removal.\n", __func__, i, block_index); } } // TODO: We Can use taken instead of this for (int i = 0; i < servicedIndices.size(); i++) { + Addr print_addr = MSHRMap[block_index][i - bias]; MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; + DPRINTF(MPU, "%s: Addr: %lu has been serviced and is removed.\n", + __func__, print_addr); } if (MSHRMap[block_index].empty()) { MSHRMap.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { + // TODO: I think this is unnecessary. cacheBlocks[block_index].hasConflict = true; } @@ -286,27 +364,33 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { Addr aligned_addr = (addr / 64) * 64; - int block_index = aligned_addr % 256; + int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / 16; - DPRINTF(MPU, "%s: Recieved a WorkList write. addr: %lu, wl: %s.\n", - __func__, addr, wl.to_string()); - DPRINTF(MPU, "%s: aligned_addr: %lu, block_index: %d, wl_offset: %d, " - "takenMask: %u.\n", __func__, aligned_addr, - block_index, wl_offset, cacheBlocks[block_index].takenMask); + + DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", + __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; + DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, + cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
// && (cacheBlocks[block_index].hasConflict) if ((cacheBlocks[block_index].takenMask == 0)) { + DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." + " It does not have any taken items anymore.\n", + __func__, block_index); evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); } if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())) { + (!evictQueue.empty())&& + ((!alarmRequested) && (spaceRequested == 0))) { schedule(nextApplyAndCommitEvent, nextCycle()); } @@ -315,90 +399,163 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyAndCommitEvent() { + assert((!alarmRequested) && (spaceRequested == 0)); int block_index = evictQueue.front(); uint8_t changedMask = 0; // TODO: parameterize 64 to memory atom size uint8_t* wl_data; uint8_t data[64]; + DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", + __func__, block_index); + DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " + "then commited.\n", __func__, block_index); + + if ((cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + + } else if ((!cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else { + alarmRequested = true; + spaceRequested = cacheBlocks[block_index].hasConflict ? 2 : 1; + DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. 
Set " + "an alarm for nextApplyAndCommitEvent when space = %d.\n", + __func__, spaceRequested); + return; + } + for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); + DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, + block_index, i, + cacheBlocks[block_index].items[i].to_string()); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); + DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", + __func__, block_index, i); } - DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. " - "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, - i, cacheBlocks[block_index].items[i].to_string()); wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); std::memcpy(data + (i * sizeof(WorkListItem)), wl_data, sizeof(WorkListItem)); } if (changedMask) { + DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" + , __func__, block_index); assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = createWritePacket( cacheBlocks[block_index].addr, 64, data); - - if ((cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", + __func__, write_pkt->getAddr()); + if (cacheBlocks[block_index].hasConflict) { + assert( + outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1 + ); + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); Addr miss_addr = MSHRMap[block_index][0]; - // TODO: Make sure this trick works; - Addr alligned_miss_addr = (miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = (miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); outstandingMemReqQueue.push_back(write_pkt); outstandingMemReqQueue.push_back(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue. 
" + "outstandingMemReqQueue.size = %u.\n", __func__, + outstandingMemReqQueue.size()); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); } if ((changedMask & (2)) == 2) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); } if ((changedMask & (4)) == 4) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); } if ((changedMask & (8)) == 8) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); } cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; evictQueue.pop_front(); - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); - } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + } else { + assert(outstandingMemReqQueue.size() < outstandingMemReqQueueSize); + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); outstandingMemReqQueue.push_back(write_pkt); + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue. 
oustandingMemReqQueue.size = " + "%u.\n", __func__, outstandingMemReqQueue.size()); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); } if ((changedMask & (2)) == 2) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); } if ((changedMask & (4)) == 4) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); } if ((changedMask & (8)) == 8) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); } cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; evictQueue.pop_front(); - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); - } else { - DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , - __func__); + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } } else { + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " + "backs are necessary. Deallocated cache line[%d].\n", + __func__, block_index, block_index); evictQueue.pop_front(); + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } if ((!nextMemReqEvent.scheduled()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 5c4e752cbf..902a960301 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -74,6 +74,8 @@ class CoalesceEngine : public BaseReadEngine std::unordered_map> MSHRMap; int outstandingMemReqQueueSize; + bool alarmRequested; + int spaceRequested; std::deque outstandingMemReqQueue; std::deque addrResponseQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70d6242f5b..c9ed781d79 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -212,7 +212,7 @@ PushEngine::processNextPushEvent() __func__, pkt->getAddr(), offset); Edge* e = (Edge*) (data + offset); - DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + // DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); // TODO: Implement propagate function here uint32_t update_value = value + 1; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 27c7ad4fea..ea45cae652 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -68,7 +68,7 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - DPRINTF(MPU, "%s: Sending a reqRetry.\n", __func__); + DPRINTF(MPU, "%s: Sending a RetryReq.\n", __func__); sendRetryReq(); needSendRetryReq = false; } @@ -121,43 +121,49 @@ WLEngine::processNextReadEvent() PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); uint32_t update_value = update->getLE(); + DPRINTF(MPU, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " + "value: %u.\n", __func__, update_addr, update_value); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { + DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", + __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { if (coalesceEngine->recvReadAddr(update_addr)) { - DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " - "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, update_value); onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " + "onTheFlyUpdateMap[%lu] = %u.\n", __func__, + update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: 0: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); if (updateQueue.size() == updateQueueSize - 1) { respPort.checkRetryReq(); } - } } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." - "update_addr: %lu, update_value: %u, old_value: %u.\n", - __func__, update_addr, update_value, - onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " + "onTheFlyUpdateMap[%lu] = %u.", __func__, update_addr, + update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Reduced the update_value with the entry in " + "onTheFlyUpdateMap. 
onTheFlyUpdateMap[%lu] = %u.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: 1: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); if (updateQueue.size() == updateQueueSize - 1) { respPort.checkRetryReq(); } } // TODO: Only schedule nextReadEvent only when it has to be scheduled - if ((!nextReadEvent.scheduled()) && - (!updateQueue.empty())) { + if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); } } @@ -166,9 +172,14 @@ void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + addrWorkListMap[addr] = wl; - // TODO: Add checks to see if scheduling is necessary or correct. - if ((!nextReduceEvent.scheduled()) && (!addrWorkListMap.empty())) { + DPRINTF(MPU, "%s: Received a WorkListItem from the coalesceEngine. Adding" + " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", + __func__, addr, wl.to_string()); + + assert(!addrWorkListMap.empty()); + if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } } @@ -182,25 +193,32 @@ WLEngine::processNextReduceEvent() std::vector servicedAddresses; while (it != addrWorkListMap.end()) { Addr addr = it->first; - WorkListItem wl = it->second; uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " - "%d, with new update: %d.\n", __func__, addr, wl.tempProp, - onTheFlyUpdateMap[addr]); + DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " + "addrWorkListMap values. 
onTheFlyUpdateMap[%lu] = %u, " + "addrWorkListMap[%lu] = %s.\n", __func__, + addr, onTheFlyUpdateMap[addr], + addr, addrWorkListMap[addr].to_string()); // TODO: Generalize this to reduce function rather than just min - wl.tempProp = std::min(update_value, wl.tempProp); + addrWorkListMap[addr].tempProp = + std::min(update_value, addrWorkListMap[addr].tempProp); + DPRINTF(MPU, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", + __func__, addr, addrWorkListMap[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, wl); + coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); servicedAddresses.push_back(addr); + DPRINTF(MPU, "%s: Added addr: %lu to servicedAdresses.\n", + __func__, addr); it++; } addrWorkListMap.clear(); for (int i = 0; i < servicedAddresses.size(); i++) { onTheFlyUpdateMap.erase(servicedAddresses[i]); + DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", + __func__, servicedAddresses[i]); } - DPRINTF(MPU, "%s: onTheFlyUpdateMap.size(): %u, servicedAddresses.size(): %u.\n", __func__, onTheFlyUpdateMap.size(), servicedAddresses.size()); } bool @@ -212,9 +230,10 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push_back(pkt); - + DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); assert(!updateQueue.empty()); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } From 8195339d419b32284f92f4c14395efc58a245604 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 7 Apr 2022 15:06:58 -0700 Subject: [PATCH 083/247] Restructing inheritance and fixiing inf queue. 
--- configs/accl/sega.py | 6 +- src/accl/graph/TODO.md | 1 + src/accl/graph/base/BaseReadEngine.py | 3 + src/accl/graph/base/base_read_engine.cc | 83 ++++++++ src/accl/graph/base/base_read_engine.hh | 18 +- src/accl/graph/base/data_structs.hh | 8 +- src/accl/graph/sega/CoalesceEngine.py | 3 +- src/accl/graph/sega/PushEngine.py | 1 - src/accl/graph/sega/coalesce_engine.cc | 254 ++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 16 +- src/accl/graph/sega/push_engine.cc | 142 ++++++------- src/accl/graph/sega/push_engine.hh | 55 +++-- src/accl/graph/sega/wl_engine.cc | 10 +- src/accl/graph/sega/wl_engine.hh | 2 +- 14 files changed, 348 insertions(+), 254 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a0c7766fe0..8e24280366 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -2,9 +2,9 @@ from m5.objects import * class MPU(SubSystem): - def __init__(self): + def __init__(self, base_edge_addr): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x80000000, + self.push_engine = PushEngine(base_edge_addr=base_edge_addr, push_req_queue_size=16, mem_resp_queue_size=8) self.coalesce_engine = CoalesceEngine( @@ -58,7 +58,7 @@ def __init__(self): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.mpu = MPU() + self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), vertex_binary="facebook/graph_binaries/vertices", diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index d5effbeb96..a0e2cefeff 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -6,3 +6,4 @@ and memory atom size in the coalesce engine * look at all the simobjects and come up with a general architecture. Make sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. 
+* get rid of maps with RequestPtr as keys diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py index 84c53465b9..3ddab2d3c4 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseReadEngine.py @@ -37,3 +37,6 @@ class BaseReadEngine(ClockedObject): system = Param.System(Parent.any, 'System this Engine is a part of') mem_port = RequestPort("Port to communicate with the memory") + + outstanding_mem_req_queue_size = Param.Int(16, "Capacity of queue in " + "which memory requests are queued.") diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index a32237db35..e3b588cfc6 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/base/base_read_engine.hh" +#include "debug/MPU.hh" namespace gem5 { @@ -35,6 +36,10 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), + outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + alarmRequested(false), + spaceRequested(0), + nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)) {} @@ -83,6 +88,31 @@ BaseReadEngine::MemPort::recvReqRetry() } } +void +BaseReadEngine::processNextMemReqEvent() +{ + if (memPort.blocked()) { + return; + } + + // TODO: Maybe add a DPRINTF here. 
+ PacketPtr pkt = outstandingMemReqQueue.front(); + memPort.sendPacket(pkt); + outstandingMemReqQueue.pop_front(); + + if (alarmRequested && + (outstandingMemReqQueue.size() <= + (outstandingMemReqQueueSize - spaceRequested))) { + alarmRequested = false; + spaceRequested = 0; + respondToAlarm(); + } + + if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + schedule(nextMemReqEvent, nextCycle()); + } +} + PacketPtr BaseReadEngine::createReadPacket(Addr addr, unsigned int size) { @@ -98,4 +128,57 @@ BaseReadEngine::createReadPacket(Addr addr, unsigned int size) return pkt; } +PacketPtr +BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +bool +BaseReadEngine::memReqQueueHasSpace(int space) +{ + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + return ( + outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space) + ); +} + +bool +BaseReadEngine::memReqQueueFull() +{ + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); +} + +void +BaseReadEngine::enqueueMemReq(PacketPtr pkt) +{ + panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); + outstandingMemReqQueue.push_back(pkt); + + assert(!outstandingMemReqQueue.empty()); + if (!nextMemReqEvent.scheduled()) { + schedule(nextMemReqEvent, nextCycle()); + } +} + +void +BaseReadEngine::requestAlarm(int space) { + panic_if((alarmRequested == true) || (spaceRequested != 0), + "You should not request another alarm without the first one being" + "responded to.\n"); + alarmRequested = true; + spaceRequested = space; +} + } diff --git 
a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index e21aaa01d2..bec922beef 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,16 +68,30 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; + int outstandingMemReqQueueSize; + bool alarmRequested; + int spaceRequested; + std::deque outstandingMemReqQueue; + + EventFunctionWrapper nextMemReqEvent; + void processNextMemReqEvent(); + protected: const RequestorID _requestorId; - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + bool memReqQueueHasSpace(int space); + bool memReqQueueFull(); + void enqueueMemReq(PacketPtr pkt); + bool pendingAlarm() { return alarmRequested; } + void requestAlarm(int space); + + virtual void respondToAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); public: PARAMS(BaseReadEngine); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index dacb74e38c..28a503528f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -70,10 +70,10 @@ struct __attribute__ ((packed)) Edge uint16_t weight : 16; uint64_t neighbor : 48; - // std::string to_string() - // { - // return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); - // } + std::string to_string() + { + return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + } Edge(uint16_t weight, uint64_t neighbor): weight(weight), diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 0330da7576..bec7e3d233 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -33,8 +33,7 @@ class 
CoalesceEngine(BaseReadEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - + peer_push_engine = Param.PushEngine(NULL, "") num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") - outstanding_mem_req_queue_size = Param.Int(20, "") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 129d9454c7..645bc5f4ea 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -36,5 +36,4 @@ class PushEngine(BaseReadEngine): req_port = RequestPort("Port to send updates to the outside") base_edge_addr = Param.Addr("") - mem_resp_queue_size = Param.Int(0, "") push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d7fa806fff..015629245b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -40,10 +40,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine(params.peer_push_engine), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - alarmRequested(false), - spaceRequested(0), - nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) @@ -85,14 +81,11 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit - // TODO: Make addrQueue and wlQueue into one std::pair - addrResponseQueue.push_back(addr); - worklistResponseQueue.push_back( - cacheBlocks[block_index].items[wl_offset]); + responseQueue.push_back(std::make_tuple(addr, + cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. 
Pushed cacheBlocks[%d][%d]: %s " - "to worklistResponseQueue. worklistResponseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, - worklistResponseQueue.size(), + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -100,7 +93,7 @@ CoalesceEngine::recvReadAddr(Addr addr) stats.readHits++; stats.numVertexReads++; - assert(!worklistResponseQueue.empty() && !addrResponseQueue.empty()); + assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } @@ -136,21 +129,18 @@ CoalesceEngine::recvReadAddr(Addr addr) "line[%d]", __func__, addr, block_index); return true; } else { - // TODO: Set valid to false every deallocation and assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - assert( - outstandingMemReqQueue.size() <= - outstandingMemReqQueueSize); + //TODO: Fix this to work with new inheritance. + // assert( + // outstandingMemReqQueue.size() <= + // outstandingMemReqQueueSize); DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (outstandingMemReqQueue.size() == - outstandingMemReqQueueSize) { - DPRINTF(MPU, "%s: No space in outstandingMemReqQueue " - "(outstandingMemReqQueue.size: %u). " - "Rejecting request.\n", __func__, - outstandingMemReqQueue.size()); + if (memReqQueueFull()) { + DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. " + "Rejecting request.\n", __func__); return false; } cacheBlocks[block_index].addr = aligned_addr; @@ -169,17 +159,10 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = 64.\n", __func__, addr, aligned_addr); - outstandingMemReqQueue.push_back(pkt); - DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %d", __func__, - outstandingMemReqQueue.size()); - + enqueueMemReq(pkt); + DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", + __func__); stats.numVertexBlockReads++; - - assert(!outstandingMemReqQueue.empty()); - if (!nextMemReqEvent.scheduled()) { - schedule(nextMemReqEvent, nextCycle()); - } return true; } } @@ -207,65 +190,41 @@ CoalesceEngine::recvReadAddr(Addr addr) } } -void -CoalesceEngine::processNextMemReqEvent() -{ - PacketPtr pkt = outstandingMemReqQueue.front(); - - if (!memPortBlocked()) { - sendMemReq(pkt); - DPRINTF(MPU, "%s: Sent a memory request to Addr: %lu, MemCmd: %s.\n", - __func__, pkt->getAddr(), pkt->isRead() ? "Read" : "Write"); - outstandingMemReqQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a packet from outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %u.\n", __func__, - outstandingMemReqQueue.size()); - } - - if ((alarmRequested) && - (outstandingMemReqQueue.size() < - (outstandingMemReqQueueSize - spaceRequested))) { - alarmRequested = false; - spaceRequested = 0; - schedule(nextApplyAndCommitEvent, nextCycle()); - DPRINTF(MPU, "%s: There is an alarm request for " - "nextApplyAndCommitEvent. 
Reset alarm parameters and scheduled " - "nextApplyAndCommitEvent.\n", __func__); - } - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - void CoalesceEngine::processNextRespondEvent() { - Addr addr_response = addrResponseQueue.front(); - WorkListItem worklist_response = worklistResponseQueue.front(); + Addr addr_response; + WorkListItem worklist_response; + std::tie(addr_response, worklist_response) = responseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); - addrResponseQueue.pop_front(); - worklistResponseQueue.pop_front(); + responseQueue.pop_front(); DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " "worklistResponseQueue.size = %d.\n", __func__, - worklistResponseQueue.size()); + responseQueue.size()); if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + (!responseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } +void +CoalesceEngine::respondToAlarm() +{ + assert(!nextApplyAndCommitEvent.scheduled()); + schedule(nextApplyAndCommitEvent, nextCycle()); +} + bool CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); if (pkt->isWrite()) { + delete pkt; DPRINTF(MPU, "%s: Received a write response for Addr: %lu. 
Dropping " "the packet.\n", __func__, pkt->getAddr()); return true; @@ -291,6 +250,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + delete pkt; int bias = 0; std::vector servicedIndices; @@ -303,13 +263,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); - addrResponseQueue.push_back(miss_addr); - worklistResponseQueue.push_back( - cacheBlocks[block_index].items[wl_offset]); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " - "worklistResponseQueue. worklistResponseQueue.size = %u.\n" + "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, - worklistResponseQueue.size()); + responseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; servicedIndices.push_back(i); @@ -336,8 +295,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + (!responseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -363,7 +321,8 @@ CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - Addr aligned_addr = (addr / 64) * 64; + // TODO: Parameterize all the numbers here. 
+ Addr aligned_addr = std::floor(addr / 64) * 64; int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / 16; @@ -371,6 +330,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); + + if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { + cacheBlocks[block_index].hasChange = true; + } + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; @@ -378,7 +342,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - // && (cacheBlocks[block_index].hasConflict) + // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add + // to evictQueue. if ((cacheBlocks[block_index].takenMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", @@ -389,8 +354,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())&& - ((!alarmRequested) && (spaceRequested == 0))) { + (!evictQueue.empty()) && + (pendingAlarm())) { schedule(nextApplyAndCommitEvent, nextCycle()); } @@ -399,36 +364,45 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyAndCommitEvent() { - assert((!alarmRequested) && (spaceRequested == 0)); + // FIXME: Refactor the line below to work with the new inheritance. 
+ // assert((!alarmRequested) && (spaceRequested == 0)); int block_index = evictQueue.front(); uint8_t changedMask = 0; - // TODO: parameterize 64 to memory atom size - uint8_t* wl_data; - uint8_t data[64]; DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); - if ((cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)) { + if ((cacheBlocks[block_index].hasChange)&& + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(2))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", __func__, block_index); - - } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + } else if ((cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict)) { + DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", + __func__, block_index); } else { - alarmRequested = true; - spaceRequested = cacheBlocks[block_index].hasConflict ? 2 : 1; + int spaceNeeded = cacheBlocks[block_index].hasConflict ? 2 : 1; + requestAlarm(spaceNeeded); DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. 
Set " - "an alarm for nextApplyAndCommitEvent when space = %d.\n", - __func__, spaceRequested); + "an alarm for nextApplyAndCommitEvent when there is %d space.\n", + __func__, spaceNeeded); return; } + // Reducing between tempProp and prop for each item in the cache line. for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( @@ -442,23 +416,18 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", __func__, block_index, i); } - wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); - std::memcpy(data + (i * sizeof(WorkListItem)), - wl_data, sizeof(WorkListItem)); } - if (changedMask) { + if (cacheBlocks[block_index].hasChange) { DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" , __func__, block_index); - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + // TODO: Parameterize this 64 to memory atom size PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, data); + cacheBlocks[block_index].addr, 64, + (uint8_t*) cacheBlocks[block_index].items); DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", __func__, write_pkt->getAddr()); if (cacheBlocks[block_index].hasConflict) { - assert( - outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1 - ); DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " "enough space in outstandingMemReqQueue for the write " "back packet and its subsequent read packet.\n", @@ -467,18 +436,19 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); // TODO: parameterize 64 - Addr aligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." " req addr (aligned_addr) = %lu, size = 64.\n", __func__, miss_addr, aligned_miss_addr); - outstandingMemReqQueue.push_back(write_pkt); - outstandingMemReqQueue.push_back(read_pkt); + + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + enqueueMemReq(read_pkt); DPRINTF(MPU, "%s: Added the evicting write back packet along with " "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %u.\n", __func__, - outstandingMemReqQueue.size()); + " to outstandingMemReqQueue.\n" , __func__); + // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -500,22 +470,25 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, 3); } + // TODO: This should be improved + + cacheBlocks[block_index].addr = aligned_miss_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - evictQueue.pop_front(); + cacheBlocks[block_index].hasChange = false; DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " " = %u.\n", __func__, evictQueue.size()); } else { - assert(outstandingMemReqQueue.size() < outstandingMemReqQueueSize); DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " "enough space in outstandingMemReqQueue for the write back" " packet.\n", __func__, block_index); - outstandingMemReqQueue.push_back(write_pkt); + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue. oustandingMemReqQueue.size = " - "%u.\n", __func__, outstandingMemReqQueue.size()); + "outstandingMemReqQueue.\n", __func__); + // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -537,33 +510,58 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, 3); } + + // Since allocated is false, does not matter what the address is. cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - evictQueue.pop_front(); + cacheBlocks[block_index].hasChange = false; DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " " = %u.\n", __func__, evictQueue.size()); } } else { - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary. Deallocated cache line[%d].\n", + "backs are necessary.\n", __func__, block_index, block_index); - evictQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - } + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " + "deallocating the line.\n", __func__, block_index); - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - stats.numVertexBlockWrites++; - schedule(nextMemReqEvent, nextCycle()); + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + } } + evictQueue.pop_front(); + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty())) { schedule(nextApplyAndCommitEvent, nextCycle()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 902a960301..6a8aadcbae 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -53,6 +53,7 @@ class CoalesceEngine : public BaseReadEngine bool allocated; bool valid; bool hasConflict; + bool hasChange; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block(): @@ -60,7 +61,8 @@ class CoalesceEngine : public BaseReadEngine takenMask(0), allocated(false), valid(false), - hasConflict(false) + hasConflict(false), + hasChange(false) {} }; @@ -73,13 +75,7 @@ class CoalesceEngine : public BaseReadEngine int numTgtsPerMSHR; std::unordered_map> MSHRMap; - int outstandingMemReqQueueSize; - bool alarmRequested; - int spaceRequested; - std::deque outstandingMemReqQueue; - - std::deque addrResponseQueue; - std::deque worklistResponseQueue; + std::deque> responseQueue; std::deque evictQueue; @@ -88,9 +84,6 @@ class CoalesceEngine : public BaseReadEngine PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); - EventFunctionWrapper nextMemReqEvent; - void processNextMemReqEvent(); - EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -115,6 +108,7 @@ class CoalesceEngine : public BaseReadEngine CoalesceStats stats; protected: + virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c9ed781d79..86418ac76e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -39,10 +39,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): reqPort(name() + ".req_port", this), 
baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), - memRespQueueSize(params.mem_resp_queue_size), - onTheFlyReadReqs(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), - nextReadEvent([this] { processNextReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()) {} @@ -66,12 +63,13 @@ PushEngine::startup() *tempPtr = 0; // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); - PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); + PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); - sendPushUpdate(first_update); + if (!reqPort.blocked()) { + reqPort.sendPacket(first_update); + } } - void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -108,19 +106,21 @@ PushEngine::ReqPort::recvReqRetry() bool PushEngine::recvWLItem(WorkListItem wl) { - assert(pushReqQueue.size() <= pushReqQueueSize); + assert((pushReqQueueSize == 0) || + (pushReqQueue.size() <= pushReqQueueSize)); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { return false; } Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t update_value = wl.prop; - pushReqQueue.push_back( - std::make_pair(std::make_pair(start_addr, end_addr), update_value)); + uint32_t value = wl.prop; - if ((!nextAddrGenEvent.scheduled()) && - (!pushReqQueue.empty())) { + // TODO: parameterize 64 to memory atom size + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); + + assert(!pushReqQueue.empty()); + if (!nextAddrGenEvent.scheduled()) { schedule(nextAddrGenEvent, nextCycle()); } return true; @@ -129,65 +129,44 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::processNextAddrGenEvent() { - Addr start_addr, end_addr; - uint32_t update_value; - std::pair, uint32_t> front = pushReqQueue.front(); - std::tie(start_addr, end_addr) = front.first; - update_value = front.second; + Addr 
aligned_addr, offset; + int num_edges; - Addr req_addr = (start_addr / 64) * 64; - Addr req_offset = start_addr % 64; - int num_edges = 0; + PushPacketInfoGen curr_info = pushReqQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - if (end_addr > req_addr + 64) { - num_edges = (req_addr + 64 - start_addr) / sizeof(Edge); - } else { - num_edges = (end_addr - start_addr) / sizeof(Edge); - } - PacketPtr pkt = createReadPacket(req_addr, 64); - reqOffsetMap[pkt->req] = req_offset; + PacketPtr pkt = createReadPacket(aligned_addr, 64); + reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = update_value; - pendingReadReqs.push_back(pkt); + reqValueMap[pkt->req] = curr_info.value(); - pushReqQueue.pop_front(); + enqueueMemReq(pkt); - if (req_addr + 64 < end_addr) { - pushReqQueue.push_front( - std::make_pair(std::make_pair(req_addr + 64, end_addr), update_value) - ); + if (curr_info.done()) { + pushReqQueue.pop_front(); } - if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { - schedule(nextAddrGenEvent, nextCycle()); + if ((memReqQueueFull()) && (!pushReqQueue.empty())) { + requestAlarm(1); + return; } - if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { - schedule(nextReadEvent, nextCycle()); + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); } } void -PushEngine::processNextReadEvent() +PushEngine::respondToAlarm() { - if (((memRespQueue.size() + onTheFlyReadReqs) <= memRespQueueSize) && - (!memPortBlocked())) { - PacketPtr pkt = pendingReadReqs.front(); - sendMemReq(pkt); - onTheFlyReadReqs++; - pendingReadReqs.pop_front(); - } - - if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { - schedule(nextReadEvent, nextCycle()); - } + assert(!nextAddrGenEvent.scheduled()); + schedule(nextAddrGenEvent, nextCycle()); } bool PushEngine::handleMemResp(PacketPtr pkt) { - onTheFlyReadReqs--; 
memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { @@ -201,39 +180,42 @@ void PushEngine::processNextPushEvent() { PacketPtr pkt = memRespQueue.front(); - RequestPtr req = pkt->req; - uint8_t *data = pkt->getPtr(); + uint8_t* data = pkt->getPtr(); - Addr offset = reqOffsetMap[req]; - uint32_t value = reqValueMap[req]; + Addr offset = reqOffsetMap[pkt->req]; + assert(offset < 64); + uint32_t value = reqValueMap[pkt->req]; DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " "offset: %lu\n", __func__, pkt->getAddr(), offset); - Edge* e = (Edge*) (data + offset); - // DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + Edge* curr_edge = (Edge*) (data + offset); // TODO: Implement propagate function here uint32_t update_value = value + 1; DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, update_value); + __func__, curr_edge->neighbor, update_value); - PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t), update_value); + PacketPtr update = createUpdatePacket( + curr_edge->neighbor, update_value); - if (sendPushUpdate(update)) { + if (!reqPort.blocked()) { DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", - __func__, e->neighbor, update_value); - reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); - reqNumEdgeMap[req]--; - } - - if (reqNumEdgeMap[req] == 0) { + __func__, curr_edge->neighbor, update_value); + reqPort.sendPacket(update); + reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); + assert(reqOffsetMap[pkt->req] <= 64); + reqNumEdgeMap[pkt->req]--; + assert(reqNumEdgeMap[pkt->req] >= 0); + } + + if (reqNumEdgeMap[pkt->req] == 0) { + reqOffsetMap.erase(pkt->req); + reqNumEdgeMap.erase(pkt->req); + reqValueMap.erase(pkt->req); + delete pkt; memRespQueue.pop_front(); - reqOffsetMap.erase(req); - reqNumEdgeMap.erase(req); - reqValueMap.erase(req); } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { @@ 
-241,11 +223,11 @@ PushEngine::processNextPushEvent() } } -PacketPtr -// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) { - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + RequestPtr req = std::make_shared( + addr, sizeof(T), 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher // bits req->setPC(((Addr) _requestorId) << 2); @@ -255,19 +237,9 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) pkt->allocate(); // pkt->setData(data); - pkt->setLE(value); + pkt->setLE(value); return pkt; } -bool -PushEngine::sendPushUpdate(PacketPtr pkt) -{ - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return false; -} - } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a539079ede..2aba0ca008 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -39,6 +39,42 @@ namespace gem5 class PushEngine : public BaseReadEngine { private: + class PushPacketInfoGen { + private: + Addr _start; + Addr _end; + size_t _step; + size_t _atom; + uint32_t _value; + + public: + PushPacketInfoGen(Addr start, Addr end, size_t step, + size_t atom, uint32_t value): + _start(start), _end(end), _step(step), + _atom(atom), _value(value) + {} + + std::tuple nextReadPacketInfo() + { + panic_if(done(), "Should not call nextPacketInfo when done.\n"); + Addr aligned_addr = std::floor(_start / _atom) * _atom; + Addr offset = _start - aligned_addr; + int num_items = 0; + + if (_end > (_start + _atom)) { + num_items = (_atom - offset) / _step; + } else { + num_items = (_end - _start) / _step; + } + _start = aligned_addr + _atom; + + return std::make_tuple(aligned_addr, offset, num_items); + } + + uint32_t value() { return _value; } + bool done() { 
return (_start >= _end); } + }; + class ReqPort : public RequestPort { private: @@ -64,37 +100,30 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::deque, uint32_t>> pushReqQueue; + std::deque pushReqQueue; // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - // TODO: Possibility of infinite queueing - std::deque pendingReadReqs; - - int memRespQueueSize; - int onTheFlyReadReqs; + // Since the push engine can process incoming packets faster than + // memory can send those packets, the size of this queue will + // always be limited by the b/w of the memory. std::deque memRespQueue; virtual void startup(); - // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); - - bool sendPushUpdate(PacketPtr pkt); + template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextAddrGenEvent; void processNextAddrGenEvent(); - EventFunctionWrapper nextReadEvent; - void processNextReadEvent(); - EventFunctionWrapper nextPushEvent; void processNextPushEvent(); protected: + virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ea45cae652..cca945ce0a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -118,9 +118,10 @@ WLEngine::getAddrRanges() const void WLEngine::processNextReadEvent() { - PacketPtr update = updateQueue.front(); - Addr update_addr = update->getAddr(); - uint32_t update_value = update->getLE(); + Addr update_addr; + uint32_t update_value; + std::tie(update_addr, update_value) = updateQueue.front(); + DPRINTF(MPU, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " "value: %u.\n", __func__, update_addr, update_value); @@ -229,10 +230,11 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.push_back(pkt); + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); + delete pkt; assert(!updateQueue.empty()); if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 476c9be932..12df93ee79 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -71,7 +71,7 @@ class WLEngine : public BaseReduceEngine CoalesceEngine* coalesceEngine; int updateQueueSize; - std::deque updateQueue; + std::deque> updateQueue; int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; From 02f7baf9938e2a9b30ea3d9b44140862160b5aba Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 8 Apr 2022 13:13:37 -0700 Subject: [PATCH 084/247] Fixing one scheduling error in events. 
--- configs/accl/sega.py | 7 +++---- src/accl/graph/base/base_read_engine.cc | 12 ++++++++++++ src/accl/graph/base/base_read_engine.hh | 2 ++ src/accl/graph/sega/coalesce_engine.cc | 8 ++++---- src/accl/graph/sega/push_engine.cc | 23 +++++++++++++++++------ src/accl/graph/sega/wl_engine.cc | 4 ++-- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8e24280366..e45580dd37 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,13 +5,12 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, - mem_resp_queue_size=8) + push_req_queue_size=16) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, - on_the_fly_update_map_size=8) + update_queue_size=16, + on_the_fly_update_map_size=8) self.interconnect = SystemXBar() self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index e3b588cfc6..1658d85627 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -86,6 +86,8 @@ BaseReadEngine::MemPort::recvReqRetry() if (!blocked()) { blockedPacket = nullptr; } + + owner->wakeUp(); } void @@ -177,8 +179,18 @@ BaseReadEngine::requestAlarm(int space) { panic_if((alarmRequested == true) || (spaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); + DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); alarmRequested = true; spaceRequested = space; } +void +BaseReadEngine::wakeUp() +{ + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } +} + } diff --git a/src/accl/graph/base/base_read_engine.hh 
b/src/accl/graph/base/base_read_engine.hh index bec922beef..5275f86449 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -108,6 +108,8 @@ class BaseReadEngine : public ClockedObject void recvFunctional(PacketPtr pkt); + void wakeUp(); + }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 015629245b..c740597a2c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -202,8 +202,8 @@ CoalesceEngine::processNextRespondEvent() __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " - "worklistResponseQueue.size = %d.\n", __func__, + DPRINTF(MPU, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, responseQueue.size()); if ((!nextRespondEvent.scheduled()) && @@ -338,7 +338,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; - DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, + DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
@@ -355,7 +355,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty()) && - (pendingAlarm())) { + (!pendingAlarm())) { schedule(nextApplyAndCommitEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 86418ac76e..3c1a98c69a 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -120,7 +120,8 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); assert(!pushReqQueue.empty()); - if (!nextAddrGenEvent.scheduled()) { + if ((!nextAddrGenEvent.scheduled()) && + (!memReqQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } return true; @@ -133,8 +134,11 @@ PushEngine::processNextAddrGenEvent() Addr aligned_addr, offset; int num_edges; - PushPacketInfoGen curr_info = pushReqQueue.front(); + PushPacketInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + DPRINTF(MPU, "%s: Current packet information generated by " + "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, 64); reqOffsetMap[pkt->req] = offset; @@ -144,11 +148,17 @@ PushEngine::processNextAddrGenEvent() enqueueMemReq(pkt); if (curr_info.done()) { + DPRINTF(MPU, "%s: Current PushPacketInfoGen is done.\n", __func__); pushReqQueue.pop_front(); + DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" + "pushReqQueue.size() = %u.\n", + __func__, pushReqQueue.size()); } - if ((memReqQueueFull()) && (!pushReqQueue.empty())) { - requestAlarm(1); + if (memReqQueueFull()) { + if (!pushReqQueue.empty()) { + requestAlarm(1); + } return; } @@ -162,6 +172,7 @@ PushEngine::respondToAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); + DPRINTF(MPU, "%s: Responded to an alarm.\n", __func__); } bool @@ -201,9 +212,9 @@ PushEngine::processNextPushEvent() curr_edge->neighbor, update_value); if (!reqPort.blocked()) { - DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); reqPort.sendPacket(update); + DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", + __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); assert(reqOffsetMap[pkt->req] <= 64); reqNumEdgeMap[pkt->req]--; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index cca945ce0a..ad9e93ba60 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -146,7 +146,7 @@ WLEngine::processNextReadEvent() } else { // TODO: Generalize this to reduce function rather than just min DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.", __func__, update_addr, + "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); @@ -231,7 +231,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" + DPRINTF(MPU, "%s: Pushed an item to the back of updateQueue" ". 
updateQueue.size = %u.\n", __func__, updateQueue.size()); delete pkt; From 4f58d86c6eae6696ffaf735d5999400db0310d46 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 10 Apr 2022 16:42:27 -0700 Subject: [PATCH 085/247] Works!!!!!! --- configs/accl/sega.py | 4 ++-- src/accl/graph/TODO.md | 6 ++++++ src/accl/graph/sega/push_engine.cc | 8 ++++++++ src/accl/graph/sega/push_engine.hh | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e45580dd37..e68097ce74 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -11,7 +11,7 @@ def __init__(self, base_edge_addr): self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) - self.interconnect = SystemXBar() + self.interconnect = SystemXBar(max_routing_table_size=16384) self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port @@ -40,7 +40,7 @@ def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): self.edge_mem_ctrl = SimpleMemory( range=edge_range, bandwidth="25GB/s", latency="30ns", image_file=edge_binary) - self.interconnect = SystemXBar() + self.interconnect = SystemXBar(max_routing_table_size=16384) self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port self.interconnect.mem_side_ports = self.edge_mem_ctrl.port diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index a0e2cefeff..f6d77d5e22 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -7,3 +7,9 @@ and memory atom size in the coalesce engine sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys + + +Advice from Jason: +* use tryEnqueueMemReq that returns a boolean that shows if it has succeeded to enqueue the request. 
+* if it +* scratch all of these \ No newline at end of file diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 3c1a98c69a..1fced87a43 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -106,6 +106,14 @@ PushEngine::ReqPort::recvReqRetry() bool PushEngine::recvWLItem(WorkListItem wl) { + // If there are no outdoing edges, no need to generate and push + // updates. Therefore, we only need to return true. + if (wl.degree == 0) { + DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", + __func__, wl.to_string()); + return true; + } + assert((pushReqQueueSize == 0) || (pushReqQueue.size() <= pushReqQueueSize)); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2aba0ca008..29d18709ee 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -61,7 +61,7 @@ class PushEngine : public BaseReadEngine Addr offset = _start - aligned_addr; int num_items = 0; - if (_end > (_start + _atom)) { + if (_end > (aligned_addr + _atom)) { num_items = (_atom - offset) / _step; } else { num_items = (_end - _start) / _step; From b920f152ec5a935d159e0d36904e7dba5079a502 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 12 Apr 2022 09:59:31 -0700 Subject: [PATCH 086/247] Removing SystemXBar from config script. 
[has-bug] --- configs/accl/sega.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e68097ce74..dd7623bfea 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -11,10 +11,6 @@ def __init__(self, base_edge_addr): self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) - self.interconnect = SystemXBar(max_routing_table_size=16384) - - self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port - self.interconnect.cpu_side_ports = self.push_engine.mem_port def getRespPort(self): return self.wl_engine.resp_port @@ -26,10 +22,15 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.req_port = port - def getMemPort(self): - return self.interconnect.mem_side_ports - def setMemPort(self, port): - self.interconnect.mem_side_ports = port + def getVertexMemPort(self): + return self.coalesce_engine.mem_port + def setVertexMemPort(self, port): + self.coalesce_engine.mem_port = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port class MPUMemory(SubSystem): def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): @@ -40,15 +41,16 @@ def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): self.edge_mem_ctrl = SimpleMemory( range=edge_range, bandwidth="25GB/s", latency="30ns", image_file=edge_binary) - self.interconnect = SystemXBar(max_routing_table_size=16384) - self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port - self.interconnect.mem_side_ports = self.edge_mem_ctrl.port + def getVertexPort(self): + return self.vertex_mem_ctrl.port + def setVertexPort(self, port): + self.vertex_mem_ctrl.port = port - def getPort(self): - return self.interconnect.cpu_side_ports - def setPort(self, port): - self.interconnect.cpu_side_ports = port + def getEdgePort(self): + 
return self.edge_mem_ctrl.port + def setEdgePort(self, port): + self.edge_mem_ctrl.port = port class SEGA(System): def __init__(self): @@ -65,7 +67,8 @@ def __init__(self): edge_binary="facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) - self.mpu.setMemPort(self.mem_ctrl.getPort()) + self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) + self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) system = SEGA() root = Root(full_system = False, system = system) From 58e3b63ea66d9709147566e3e72c882d9bd7216e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 12 Apr 2022 19:59:24 -0700 Subject: [PATCH 087/247] Fixing the bug when deallocating a taken line. --- configs/accl/sega.py | 4 +- src/accl/graph/base/BaseReadEngine.py | 3 + src/accl/graph/base/base_read_engine.cc | 1 + src/accl/graph/base/base_read_engine.hh | 2 + src/accl/graph/sega/CoalesceEngine.py | 2 + src/accl/graph/sega/coalesce_engine.cc | 388 +++++++++++++----------- 6 files changed, 222 insertions(+), 178 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index dd7623bfea..7f4663cc82 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -36,10 +36,10 @@ class MPUMemory(SubSystem): def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): super(MPUMemory, self).__init__() self.vertex_mem_ctrl = SimpleMemory( - range=vertex_range, bandwidth="25GB/s", + range=vertex_range, bandwidth="19.2GB/s", latency="30ns", image_file=vertex_binary) self.edge_mem_ctrl = SimpleMemory( - range=edge_range, bandwidth="25GB/s", + range=edge_range, bandwidth="19.2GB/s", latency="30ns", image_file=edge_binary) def getVertexPort(self): diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py index 3ddab2d3c4..d4ab622fd6 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseReadEngine.py @@ -40,3 +40,6 @@ class BaseReadEngine(ClockedObject): outstanding_mem_req_queue_size = Param.Int(16, 
"Capacity of queue in " "which memory requests are queued.") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 1658d85627..19214a3bd1 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -36,6 +36,7 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), alarmRequested(false), spaceRequested(0), diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 5275f86449..0cab95dbbb 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,6 +68,8 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; + int peerMemoryAtomSize; + int outstandingMemReqQueueSize; bool alarmRequested; int spaceRequested; diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index bec7e3d233..3e5699f552 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,3 +37,5 @@ class CoalesceEngine(BaseReadEngine): peer_push_engine = Param.PushEngine(NULL, "") num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") + + cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index c740597a2c..41d1fe4953 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -88,7 +88,11 @@ CoalesceEngine::recvReadAddr(Addr addr) __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a 
bitset instead of unsigned int for takenMask + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.readHits++; stats.numVertexReads++; @@ -144,7 +148,11 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -256,7 +264,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / 16; @@ -269,7 +277,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.numVertexReads++; servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " @@ -336,7 +348,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].items[wl_offset] = wl; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -373,189 +389,209 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); - - if ((cacheBlocks[block_index].hasChange)&& - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(2))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - 
(!cacheBlocks[block_index].hasConflict)) { - DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", - __func__, block_index); - } else { - int spaceNeeded = cacheBlocks[block_index].hasConflict ? 2 : 1; - requestAlarm(spaceNeeded); - DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " - "an alarm for nextApplyAndCommitEvent when there is %d space.\n", - __func__, spaceNeeded); - return; - } - - // Reducing between tempProp and prop for each item in the cache line. - for (int i = 0; i < 4; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, - block_index, i, - cacheBlocks[block_index].items[i].to_string()); - if (old_prop != cacheBlocks[block_index].items[i].prop) { - changedMask |= (1 << i); - DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", - __func__, block_index, i); + if (cacheBlocks[block_index].takenMask == 0) { + if ((cacheBlocks[block_index].hasChange)&& + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(2))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict)) { + DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", + __func__, block_index); + } else { + int spaceNeeded = 
cacheBlocks[block_index].hasConflict ? 2 : 1; + requestAlarm(spaceNeeded); + DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " + "an alarm for nextApplyAndCommitEvent when there is %d space.\n", + __func__, spaceNeeded); + return; } - } - if (cacheBlocks[block_index].hasChange) { - DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" - , __func__, block_index); - // TODO: Parameterize this 64 to memory atom size - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", - __func__, write_pkt->getAddr()); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
- " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); - - enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); + // Reducing between tempProp and prop for each item in the cache line. 
+ for (int i = 0; i < 4; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, + block_index, i, + cacheBlocks[block_index].items[i].to_string()); + if (old_prop != cacheBlocks[block_index].items[i].prop) { + changedMask |= (1 << i); + DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", + __func__, block_index, i); } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + } + + if (cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" + , __func__, block_index); + // TODO: Parameterize this 64 to memory atom size + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, 64, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", + __func__, write_pkt->getAddr()); + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + enqueueMemReq(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue.\n" , __func__); + + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); + } + // TODO: This should be improved + + cacheBlocks[block_index].addr = aligned_miss_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue.\n", __func__); + + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); + } + + // Since allocated is false, does not matter what the address is. + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } - // TODO: This should be improved - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " + "backs are necessary.\n", __func__, block_index); + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " + "deallocating the line.\n", __func__, block_index); + + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; } - - // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } } else { - DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary.\n", - __func__, block_index, block_index); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " - "deallocating the line.\n", __func__, block_index); - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " + "for eviction. 
Therefore, ignoring the evict schedule.\n", + __func__, block_index); } evictQueue.pop_front(); From 6e7cb504f2c0e2db7e4d1b417994ab53e200ff7c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 09:46:44 -0700 Subject: [PATCH 088/247] Parameterizing cache_size and memory_atom_size. --- src/accl/graph/TODO.md | 12 --- src/accl/graph/base/base_read_engine.cc | 7 +- src/accl/graph/base/base_read_engine.hh | 4 +- src/accl/graph/sega/CoalesceEngine.py | 5 +- src/accl/graph/sega/coalesce_engine.cc | 127 +++++++++--------------- src/accl/graph/sega/coalesce_engine.hh | 16 +-- src/accl/graph/sega/push_engine.cc | 9 +- 7 files changed, 74 insertions(+), 106 deletions(-) diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index f6d77d5e22..1cec4dc6f9 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,15 +1,3 @@ # TODO Items - -* use setLE/setBE inside createUpdatePacket and createWritePacket -* parameterize cache size, associativity, maybe latencies, -and memory atom size in the coalesce engine -* look at all the simobjects and come up with a general architecture. Make -sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys - - -Advice from Jason: -* use tryEnqueueMemReq that returns a boolean that shows if it has succeeded to enqueue the request. 
-* if it -* scratch all of these \ No newline at end of file diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 19214a3bd1..714a4542f1 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -36,12 +36,12 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - peerMemoryAtomSize(params.attached_memory_atom_size), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), alarmRequested(false), spaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), - _requestorId(system->getRequestorId(this)) + _requestorId(system->getRequestorId(this)), + peerMemoryAtomSize(params.attached_memory_atom_size) {} BaseReadEngine::~BaseReadEngine() @@ -101,6 +101,9 @@ BaseReadEngine::processNextMemReqEvent() // TODO: Maybe add a DPRINTF here. PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); + DPRINTF(MPU, "%s: Sent a packet to memory with the following info. 
" + "pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); if (alarmRequested && diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 0cab95dbbb..f11459ad6e 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,8 +68,6 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; - int peerMemoryAtomSize; - int outstandingMemReqQueueSize; bool alarmRequested; int spaceRequested; @@ -81,6 +79,8 @@ class BaseReadEngine : public ClockedObject protected: const RequestorID _requestorId; + size_t peerMemoryAtomSize; + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } bool memReqQueueHasSpace(int space); diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 3e5699f552..faa5295ed7 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -35,7 +35,10 @@ class CoalesceEngine(BaseReadEngine): cxx_class = 'gem5::CoalesceEngine' peer_push_engine = Param.PushEngine(NULL, "") + + cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") + num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") - cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 41d1fe4953..4d152e375d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -38,21 +38,17 @@ namespace gem5 CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): BaseReadEngine(params), peerPushEngine(params.peer_push_engine), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), 
nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) -{} - -void -CoalesceEngine::startup() { - for (int i = 0; i < 256; i++) { - cacheBlocks[i].takenMask = 0; - cacheBlocks[i].allocated = false; - cacheBlocks[i].valid = false; - cacheBlocks[i].hasConflict = false; + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); } } @@ -74,8 +70,8 @@ CoalesceEngine::recvReadAddr(Addr addr) assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); - Addr aligned_addr = (addr / 64) * 64; - int block_index = (aligned_addr / 64) % 256; + Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); if ((cacheBlocks[block_index].addr == aligned_addr) && @@ -162,11 +158,11 @@ CoalesceEngine::recvReadAddr(Addr addr) MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); - // TODO: Parameterize 64 to memory atom size - PacketPtr pkt = createReadPacket(aligned_addr, 64); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, addr, aligned_addr); + " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, addr, aligned_addr, peerMemoryAtomSize); enqueueMemReq(pkt); DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); @@ -240,10 +236,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - // TODO: After parameterizing the cache size - // this 256 number should change to the cache - // size parameter. 
- int block_index = (addr / 64) % 256; + + int block_index = (addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); @@ -264,10 +258,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / 16; + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); @@ -334,9 +328,9 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. - Addr aligned_addr = std::floor(addr / 64) * 64; - int block_index = (aligned_addr / 64) % 256; - int wl_offset = (addr - aligned_addr) / 16; + Addr aligned_addr = std::floor(addr / peerMemoryAtomSize) * peerMemoryAtomSize; + int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); @@ -437,12 +431,12 @@ CoalesceEngine::processNextApplyAndCommitEvent() if (cacheBlocks[block_index].hasChange) { DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" , __func__, block_index); - // TODO: Parameterize this 64 to memory atom size + PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, + cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", - __func__, write_pkt->getAddr()); + DPRINTF(MPU, 
"%s: Created a write packet to Addr: %lu, size = %d.\n", + __func__, write_pkt->getAddr(), peerMemoryAtomSize); if (cacheBlocks[block_index].hasConflict) { DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " "enough space in outstandingMemReqQueue for the write " @@ -451,12 +445,15 @@ CoalesceEngine::processNextApplyAndCommitEvent() Addr miss_addr = MSHRMap[block_index][0]; DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * + peerMemoryAtomSize; + PacketPtr read_pkt = createReadPacket( + aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); + " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); enqueueMemReq(write_pkt); stats.numVertexBlockWrites++; @@ -465,28 +462,13 @@ CoalesceEngine::processNextApplyAndCommitEvent() "its subsequent read packet (to service the conflicts)" " to outstandingMemReqQueue.\n" , __func__); - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - 
peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); + } } - // TODO: This should be improved cacheBlocks[block_index].addr = aligned_miss_addr; DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, @@ -509,26 +491,12 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Added the write back packet to " "outstandingMemReqQueue.\n", __func__); - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); + } } // Since allocated is false, does not matter what the address is. 
@@ -555,11 +523,14 @@ CoalesceEngine::processNextApplyAndCommitEvent() Addr miss_addr = MSHRMap[block_index][0]; DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * + peerMemoryAtomSize; + PacketPtr read_pkt = createReadPacket( + aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", + " req addr (aligned_addr) = %lu, size = %d.\n", __func__, miss_addr, aligned_miss_addr); enqueueMemReq(read_pkt); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6a8aadcbae..0ddbdfdeb1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,7 +47,7 @@ class CoalesceEngine : public BaseReadEngine private: struct Block { - WorkListItem items[4]; + WorkListItem* items; Addr addr; uint8_t takenMask; bool allocated; @@ -56,20 +56,26 @@ class CoalesceEngine : public BaseReadEngine bool hasChange; // TODO: This might be useful in the future // Tick lastWLWriteTick; - Block(): + Block() {} + Block(int num_elements): addr(0), takenMask(0), allocated(false), valid(false), hasConflict(false), hasChange(false) - {} + { + items = new WorkListItem [num_elements]; + } }; WLEngine* peerWLEngine; PushEngine* peerPushEngine; - Block cacheBlocks[256]; + Block* cacheBlocks; + + int numLines; + int numElementsPerLine; int numMSHREntry; int numTgtsPerMSHR; @@ -79,8 +85,6 @@ class CoalesceEngine : public BaseReadEngine std::deque evictQueue; - virtual void startup(); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); diff --git 
a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 1fced87a43..8dcbac0dcc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -124,8 +124,7 @@ PushEngine::recvWLItem(WorkListItem wl) Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - // TODO: parameterize 64 to memory atom size - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && @@ -148,7 +147,7 @@ PushEngine::processNextAddrGenEvent() "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); - PacketPtr pkt = createReadPacket(aligned_addr, 64); + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; reqValueMap[pkt->req] = curr_info.value(); @@ -202,7 +201,7 @@ PushEngine::processNextPushEvent() uint8_t* data = pkt->getPtr(); Addr offset = reqOffsetMap[pkt->req]; - assert(offset < 64); + assert(offset < peerMemoryAtomSize); uint32_t value = reqValueMap[pkt->req]; DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " @@ -224,7 +223,7 @@ PushEngine::processNextPushEvent() DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); - assert(reqOffsetMap[pkt->req] <= 64); + assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); reqNumEdgeMap[pkt->req]--; assert(reqNumEdgeMap[pkt->req] >= 0); } From c216819f0ff4c7103a6f62e416f897068a460e52 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 10:21:28 -0700 Subject: [PATCH 089/247] Renaming BaseReadEngine to BaseMemEngine. 
--- configs/accl/sega.py | 6 ++- .../{BaseReadEngine.py => BaseMemEngine.py} | 8 ++-- src/accl/graph/base/SConscript | 4 +- ...base_read_engine.cc => base_mem_engine.cc} | 30 +++++++------- ...base_read_engine.hh => base_mem_engine.hh} | 20 +++++----- src/accl/graph/base/data_structs.hh | 6 +-- src/accl/graph/sega/CoalesceEngine.py | 4 +- src/accl/graph/sega/PushEngine.py | 4 +- src/accl/graph/sega/coalesce_engine.cc | 39 ++----------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 4 +- src/accl/graph/sega/push_engine.hh | 4 +- 12 files changed, 52 insertions(+), 81 deletions(-) rename src/accl/graph/base/{BaseReadEngine.py => BaseMemEngine.py} (92%) rename src/accl/graph/base/{base_read_engine.cc => base_mem_engine.cc} (87%) rename src/accl/graph/base/{base_read_engine.hh => base_mem_engine.hh} (88%) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7f4663cc82..7d8b96490d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,9 +5,11 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16) + push_req_queue_size=16, + attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( - peer_push_engine=self.push_engine) + peer_push_engine=self.push_engine, + attached_memory_atom_size=64) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseMemEngine.py similarity index 92% rename from src/accl/graph/base/BaseReadEngine.py rename to src/accl/graph/base/BaseMemEngine.py index d4ab622fd6..69f68e9dfc 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseMemEngine.py @@ -29,11 +29,11 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class BaseReadEngine(ClockedObject): +class 
BaseMemEngine(ClockedObject): abstract = True - type = 'BaseReadEngine' - cxx_header = "accl/graph/base/base_read_engine.hh" - cxx_class = 'gem5::BaseReadEngine' + type = 'BaseMemEngine' + cxx_header = "accl/graph/base/base_mem_engine.hh" + cxx_class = 'gem5::BaseMemEngine' system = Param.System(Parent.any, 'System this Engine is a part of') mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index ea96f4323b..4c90dfa9a6 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,8 +27,8 @@ Import('*') -SimObject('BaseReadEngine.py') +SimObject('BaseMemEngine.py') SimObject('BaseReduceEngine.py') -Source('base_read_engine.cc') +Source('base_mem_engine.cc') Source('base_reduce_engine.cc') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_mem_engine.cc similarity index 87% rename from src/accl/graph/base/base_read_engine.cc rename to src/accl/graph/base/base_mem_engine.cc index 714a4542f1..50e64ae7c3 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -26,13 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" namespace gem5 { -BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): +BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), @@ -44,11 +44,11 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): peerMemoryAtomSize(params.attached_memory_atom_size) {} -BaseReadEngine::~BaseReadEngine() +BaseMemEngine::~BaseMemEngine() {} Port& -BaseReadEngine::getPort(const std::string &if_name, PortID idx) +BaseMemEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "mem_port") { return memPort; @@ -58,7 +58,7 @@ BaseReadEngine::getPort(const std::string &if_name, PortID idx) } void -BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) +BaseMemEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
@@ -70,14 +70,14 @@ BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) } bool -BaseReadEngine::MemPort::recvTimingResp(PacketPtr pkt) +BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) { //TODO: Investigate sending true all the time return owner->handleMemResp(pkt); } void -BaseReadEngine::MemPort::recvReqRetry() +BaseMemEngine::MemPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); @@ -92,7 +92,7 @@ BaseReadEngine::MemPort::recvReqRetry() } void -BaseReadEngine::processNextMemReqEvent() +BaseMemEngine::processNextMemReqEvent() { if (memPort.blocked()) { return; @@ -120,7 +120,7 @@ BaseReadEngine::processNextMemReqEvent() } PacketPtr -BaseReadEngine::createReadPacket(Addr addr, unsigned int size) +BaseMemEngine::createReadPacket(Addr addr, unsigned int size) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -135,7 +135,7 @@ BaseReadEngine::createReadPacket(Addr addr, unsigned int size) } PacketPtr -BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); @@ -151,7 +151,7 @@ BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseReadEngine::memReqQueueHasSpace(int space) +BaseMemEngine::memReqQueueHasSpace(int space) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return ( @@ -160,14 +160,14 @@ BaseReadEngine::memReqQueueHasSpace(int space) } bool -BaseReadEngine::memReqQueueFull() +BaseMemEngine::memReqQueueFull() { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); } void -BaseReadEngine::enqueueMemReq(PacketPtr pkt) +BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not 
enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); @@ -179,7 +179,7 @@ BaseReadEngine::enqueueMemReq(PacketPtr pkt) } void -BaseReadEngine::requestAlarm(int space) { +BaseMemEngine::requestAlarm(int space) { panic_if((alarmRequested == true) || (spaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); @@ -189,7 +189,7 @@ BaseReadEngine::requestAlarm(int space) { } void -BaseReadEngine::wakeUp() +BaseMemEngine::wakeUp() { if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_mem_engine.hh similarity index 88% rename from src/accl/graph/base/base_read_engine.hh rename to src/accl/graph/base/base_mem_engine.hh index f11459ad6e..fb7cab91b0 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -26,33 +26,33 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ #include #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/BaseReadEngine.hh" +#include "params/BaseMemEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" namespace gem5 { -class BaseReadEngine : public ClockedObject +class BaseMemEngine : public ClockedObject { private: class MemPort : public RequestPort { private: - BaseReadEngine* owner; + BaseMemEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - MemPort(const std::string& name, BaseReadEngine* owner): + MemPort(const std::string& name, BaseMemEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -96,10 +96,10 @@ class BaseReadEngine : public ClockedObject PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); public: - PARAMS(BaseReadEngine); + PARAMS(BaseMemEngine); - BaseReadEngine(const BaseReadEngineParams ¶ms); - ~BaseReadEngine(); + BaseMemEngine(const BaseMemEngineParams ¶ms); + ~BaseMemEngine(); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -116,4 +116,4 @@ class BaseReadEngine : public ClockedObject } -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 28a503528f..409245eeaa 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_GRAPH_BASE_UTIL_HH__ -#define __ACCL_GRAPH_BASE_UTIL_HH__ +#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ +#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ #include "base/cprintf.hh" @@ -83,4 +83,4 @@ struct __attribute__ ((packed)) Edge } -#endif // __ACCL_GRAPH_BASE_UTIL_HH__ +#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index faa5295ed7..086f284950 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseReadEngine import BaseReadEngine +from m5.objects.BaseMemEngine import BaseMemEngine -class CoalesceEngine(BaseReadEngine): +class CoalesceEngine(BaseMemEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 645bc5f4ea..d3276799aa 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseReadEngine import BaseReadEngine +from m5.objects.BaseMemEngine import BaseMemEngine -class PushEngine(BaseReadEngine): +class PushEngine(BaseMemEngine): type = 'PushEngine' cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4d152e375d..1c5dee8b8f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -36,7 +36,7 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): - BaseReadEngine(params), + BaseMemEngine(params), peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), @@ -83,12 +83,8 
@@ CoalesceEngine::recvReadAddr(Addr addr) "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); - // TODO: Use a bitset instead of unsigned int for takenMask - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.readHits++; stats.numVertexReads++; @@ -144,11 +140,7 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -271,11 +263,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.numVertexReads++; servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " @@ -342,11 +330,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].items[wl_offset] = wl; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -413,7 +397,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() } // Reducing between tempProp and prop for each item in the cache line. 
- for (int i = 0; i < 4; i++) { + for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, @@ -471,11 +455,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() } cacheBlocks[block_index].addr = aligned_miss_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; @@ -500,11 +480,8 @@ CoalesceEngine::processNextApplyAndCommitEvent() } // Since allocated is false, does not matter what the address is. - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -535,11 +512,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() enqueueMemReq(read_pkt); cacheBlocks[block_index].addr = aligned_miss_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; @@ -548,11 +521,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
Just " "deallocating the line.\n", __func__, block_index); - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0ddbdfdeb1..4c4cb4567b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,7 +29,7 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" @@ -42,7 +42,7 @@ namespace gem5 class WLEngine; -class CoalesceEngine : public BaseReadEngine +class CoalesceEngine : public BaseMemEngine { private: struct Block diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8dcbac0dcc..53cb428b12 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -35,7 +35,7 @@ namespace gem5 { PushEngine::PushEngine(const PushEngineParams ¶ms): - BaseReadEngine(params), + BaseMemEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), @@ -49,7 +49,7 @@ PushEngine::getPort(const std::string &if_name, PortID idx) if (if_name == "req_port") { return reqPort; } else if (if_name == "mem_port") { - return BaseReadEngine::getPort(if_name, idx); + return BaseMemEngine::getPort(if_name, idx); } else { return SimObject::getPort(if_name, idx); } diff --git a/src/accl/graph/sega/push_engine.hh 
b/src/accl/graph/sega/push_engine.hh index 29d18709ee..5e8b079d88 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,14 +29,14 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "params/PushEngine.hh" namespace gem5 { -class PushEngine : public BaseReadEngine +class PushEngine : public BaseMemEngine { private: class PushPacketInfoGen { From 293cb52c7cd6175ee9f5e8e279a363b781ca0b15 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 10:30:08 -0700 Subject: [PATCH 090/247] Adding a new SConscript for src/accl. --- configs/accl/sega.py | 4 ++-- src/accl/graph/SConscript | 30 ++++++++++++++++++++++++++++++ src/accl/graph/sega/SConscript | 2 +- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 src/accl/graph/SConscript diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7d8b96490d..4168217f4d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="facebook/graph_binaries/vertices", + vertex_binary="graphs/facebook/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="facebook/graph_binaries/edgelist_0") + edge_binary="graphs/facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript new file mode 100644 index 0000000000..00fa2466dd --- /dev/null +++ b/src/accl/graph/SConscript @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Import('*') + +DebugFlag('MPU') \ No newline at end of file diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 9b4629838b..6e563b2677 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -35,4 +35,4 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('MPU') +DebugFlag('WLWrites') From 5df2ae29e0faaa80cda5721ad137cdc84b6235e8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 14:11:32 -0700 Subject: [PATCH 091/247] Fixing stats and adding a few new ones. --- configs/accl/sega.py | 4 +-- src/accl/graph/sega/SConscript | 2 +- src/accl/graph/sega/coalesce_engine.cc | 43 ++++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 4 +-- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 4168217f4d..0532aa2153 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/facebook/graph_binaries/vertices", + vertex_binary="graphs/epinions/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/facebook/graph_binaries/edgelist_0") + edge_binary="graphs/epinions/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 6e563b2677..19d702c49a 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -35,4 +35,4 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('WLWrites') +DebugFlag('ApplyUpdates') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1c5dee8b8f..36a7ddb6d2 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/wl_engine.hh" +#include "debug/ApplyUpdates.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -83,16 +84,14 @@ CoalesceEngine::recvReadAddr(Addr addr) "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); - cacheBlocks[block_index].takenMask |= (1 << wl_offset); - stats.readHits++; - stats.numVertexReads++; assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } + stats.numVertexReads++; return true; } else { // miss @@ -105,6 +104,7 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries DPRINTF(MPU, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); + stats.readRejections++; return false; } else { DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); @@ -117,12 +117,15 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); + stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d]", __func__, addr, block_index); + stats.readMisses++; + stats.numVertexReads++; return true; } else { assert(!cacheBlocks[block_index].valid); @@ -137,6 +140,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if (memReqQueueFull()) { DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. 
" "Rejecting request.\n", __func__); + stats.readRejections++; return false; } cacheBlocks[block_index].addr = aligned_addr; @@ -158,7 +162,8 @@ CoalesceEngine::recvReadAddr(Addr addr) enqueueMemReq(pkt); DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); - stats.numVertexBlockReads++; + stats.readMisses++; + stats.numVertexReads++; return true; } } @@ -169,6 +174,7 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); + stats.readRejections++; return false; } if ((!cacheBlocks[block_index].hasConflict) && @@ -178,9 +184,17 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; } + + if (aligned_addr != cacheBlocks[block_index].addr) { + stats.readMisses++; + } else { + stats.readHits++; + } + MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); + stats.numVertexReads++; return true; } } @@ -264,7 +278,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) , __func__, block_index, wl_offset, responseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - stats.numVertexReads++; + servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); @@ -334,7 +348,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); - // TODO: Make this more general and programmable. // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add // to evictQueue. 
@@ -440,7 +453,6 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; enqueueMemReq(read_pkt); DPRINTF(MPU, "%s: Added the evicting write back packet along with " "its subsequent read packet (to service the conflicts)" @@ -448,6 +460,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() for (int i = 0; i < numElementsPerLine; i++) { if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, i); @@ -467,12 +482,14 @@ CoalesceEngine::processNextApplyAndCommitEvent() "enough space in outstandingMemReqQueue for the write back" " packet.\n", __func__, block_index); enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; DPRINTF(MPU, "%s: Added the write back packet to " "outstandingMemReqQueue.\n", __func__); for (int i = 0; i < numElementsPerLine; i++) { if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, i); @@ -548,16 +565,16 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), - ADD_STAT(numVertexBlockReads, statistics::units::Count::get(), - "Number of memory blocks read for vertecies"), - ADD_STAT(numVertexBlockWrites, statistics::units::Count::get(), - "Number of memory blocks writes for vertecies"), ADD_STAT(numVertexReads, 
statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), "Number of memory vertecies written to cache."), ADD_STAT(readHits, statistics::units::Count::get(), - "Number of cache hits.") + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readRejections, statistics::units::Count::get(), + "Number of cache rejections.") { } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4c4cb4567b..efd19d3e9b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -102,11 +102,11 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine &coalesce; - statistics::Scalar numVertexBlockReads; - statistics::Scalar numVertexBlockWrites; statistics::Scalar numVertexReads; statistics::Scalar numVertexWrites; statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readRejections; }; CoalesceStats stats; From 4e169aa65eb3e7e1302c66c4031695515d613fff Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 15 Apr 2022 15:21:34 -0700 Subject: [PATCH 092/247] Fixing memory atom size issue. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0532aa2153..61df2cc2ef 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ def __init__(self, base_edge_addr): attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, - attached_memory_atom_size=64) + attached_memory_atom_size=32) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 36a7ddb6d2..e54447fd09 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -251,7 +251,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR - for (int i = 0; i < 4; i++) { + for (int i = 0; i < numElementsPerLine; i++) { cacheBlocks[block_index].items[i] = *((WorkListItem*) ( data + (i * sizeof(WorkListItem)))); DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 53cb428b12..195cb65dbc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -185,6 +185,8 @@ PushEngine::respondToAlarm() bool PushEngine::handleMemResp(PacketPtr pkt) { + // TODO: in case we need to edit edges, get rid of second statement. + assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { From 7f52d64d0433af8ec9727ef6e6d18c297e039f8e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 17 Apr 2022 13:34:12 -0700 Subject: [PATCH 093/247] Removing dead code. 
--- configs/accl/sega.py | 4 ++-- src/accl/graph/sega/push_engine.cc | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 61df2cc2ef..450f158f93 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/epinions/graph_binaries/vertices", + vertex_binary="graphs/test-graph/graph_binaries/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/epinions/graph_binaries/edgelist_0") + edge_binary="graphs/test-graph/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 195cb65dbc..716daf92e8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -58,11 +58,6 @@ PushEngine::getPort(const std::string &if_name, PortID idx) void PushEngine::startup() { - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); if (!reqPort.blocked()) { From 2ca8a986a07d819484f5bc40d18101481d6cdf40 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 19 Apr 2022 12:03:25 -0700 Subject: [PATCH 094/247] [WIP] added the central control unit. 
It has error about the crossbar --- configs/accl/sega.py | 10 +- src/accl/graph/sega/CenteralController.py | 39 +++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/centeral_controller.cc | 123 +++++++++++++++++++++ src/accl/graph/sega/centeral_controller.hh | 84 ++++++++++++++ src/accl/graph/sega/push_engine.cc | 10 -- src/accl/graph/sega/push_engine.hh | 2 - src/accl/graph/sega/wl_engine.cc | 6 + src/accl/graph/sega/wl_engine.hh | 2 + 9 files changed, 263 insertions(+), 15 deletions(-) create mode 100644 src/accl/graph/sega/CenteralController.py create mode 100644 src/accl/graph/sega/centeral_controller.cc create mode 100644 src/accl/graph/sega/centeral_controller.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 450f158f93..c4288c92d3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,14 +61,18 @@ def __init__(self): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() + self.ctrl = CenteralController(addr=0, value=0) self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/test-graph/graph_binaries/vertices_0", + vertex_binary="graphs/test/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/test-graph/graph_binaries/edgelist_0") + edge_binary="graphs/test/edgelist_0") + self.interconnect = SystemXBar() - self.mpu.setReqPort(self.mpu.getRespPort()) + self.ctrl.req_port = self.interconnect.cpu_side_ports + self.mpu.setReqPort(self.interconnect.cpu_side_ports) + self.mpu.setRespPort(self.interconnect.mem_side_ports) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..7b00f8b12d --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 
-*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class CenteralController(ClockedObject): + type = 'CenteralController' + cxx_header = "accl/graph/sega/centeral_controller.hh" + cxx_class = 'gem5::CenteralController' + + req_port = RequestPort("Port to send updates to the outside") + addr = Param.Addr("") + value = Param.Int(0, "") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 19d702c49a..c8810bbdb2 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,12 @@ Import('*') +SimObject('CenteralController.py') SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') +Source('centeral_controller.cc') Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc new file mode 100644 index 0000000000..daa2d9b390 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.cc @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/centeral_controller.hh" + +#include "mem/packet_access.hh" + +namespace gem5 +{ + +CenteralController::CenteralController + (const CenteralControllerParams ¶ms): + ClockedObject(params), + reqPort(name() + ".req_port", this), + addr(params.addr), + value(params.value) +{} + +Port& +CenteralController::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +CenteralController::startup() +{ + PacketPtr first_update = + createUpdatePacket(addr, value); + + if (!reqPort.blocked()) { + reqPort.sendPacket(first_update); + } +} + +template PacketPtr +CenteralController::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared( + addr, sizeof(T), addr, value); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) value) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +// AddrRangeList +// CenteralController::ReqPort::getAddrRanges() const 
+// { +// AddrRangeList ret; +// ret.clear(); +// return ret; +// } + +void +CenteralController::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +CenteralController::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!_blocked) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh new file mode 100644 index 0000000000..0e1bb6ac80 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.hh @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ + +#include "accl/graph/base/data_structs.hh" +#include "params/CenteralController.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class CenteralController : public ClockedObject +{ + private: + class ReqPort : public RequestPort + { + private: + CenteralController* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, CenteralController* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + // virtual AddrRangeList getAddrRanges() const; + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; + + Addr addr; + uint32_t value; + + template PacketPtr + createUpdatePacket(Addr addr, T value); + + virtual void startup(); + + public: + PARAMS(CenteralController); + CenteralController(const CenteralControllerParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} + 
+#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 716daf92e8..ddfc2edef8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -55,16 +55,6 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } -void -PushEngine::startup() -{ - PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); - - if (!reqPort.blocked()) { - reqPort.sendPacket(first_update); - } -} - void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5e8b079d88..ce9045e91a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -112,8 +112,6 @@ class PushEngine : public BaseMemEngine // always be limited by the b/w of the memory. std::deque memRespQueue; - virtual void startup(); - template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextAddrGenEvent; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ad9e93ba60..40fca42d26 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -58,6 +58,12 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } } +void +WLEngine::init() +{ + respPort.sendRangeChange(); +} + AddrRangeList WLEngine::RespPort::getAddrRanges() const { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 12df93ee79..2698ce3ea8 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -65,6 +65,8 @@ class WLEngine : public BaseReduceEngine virtual void recvRespRetry(); }; + virtual void init(); + RespPort respPort; bool blockedByCoalescer; From a95da7b0dc83e976b444f5304e818ffe96adf90e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 22 Apr 2022 11:44:24 -0700 Subject: [PATCH 095/247] Adding UpdateWL as a MemCmd and fixing code. 
--- configs/accl/sega.py | 5 +- src/accl/graph/TODO.md | 5 + src/accl/graph/base/data_structs.hh | 3 + src/accl/graph/sega/centeral_controller.cc | 14 +- src/accl/graph/sega/coalesce_engine.cc | 195 +++++++++------------ src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 31 +--- src/mem/packet.cc | 40 +---- src/mem/packet.hh | 4 +- 9 files changed, 105 insertions(+), 194 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c4288c92d3..aa3675d847 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -65,14 +65,15 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/test/vertices_0", + vertex_binary="graphs/epinions/graph_binaries/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/test/edgelist_0") + edge_binary="graphs/epinions/graph_binaries/edgelist_0") self.interconnect = SystemXBar() self.ctrl.req_port = self.interconnect.cpu_side_ports self.mpu.setReqPort(self.interconnect.cpu_side_ports) self.mpu.setRespPort(self.interconnect.mem_side_ports) + self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index 1cec4dc6f9..f5690a3faa 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,3 +1,8 @@ # TODO Items + * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys +* add UpdateWL as a MemCmd +* Replace std::floor with roundDown from intmath.hh in src +* We might need to revisit the fact that we could insert something to a queue on + the same cycle that another event is consuming something from the queue. 
diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 409245eeaa..7535d4bbac 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ #include "base/cprintf.hh" +#include "base/intmath.hh" namespace gem5 { @@ -81,6 +82,8 @@ struct __attribute__ ((packed)) Edge {} }; +static_assert(isPowerOf2(sizeof(WorkListItem))); + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index daa2d9b390..41ebeb9cd6 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -54,8 +54,7 @@ CenteralController::getPort(const std::string &if_name, PortID idx) void CenteralController::startup() { - PacketPtr first_update = - createUpdatePacket(addr, value); + PacketPtr first_update = createUpdatePacket(addr, value); if (!reqPort.blocked()) { reqPort.sendPacket(first_update); @@ -71,8 +70,7 @@ CenteralController::createUpdatePacket(Addr addr, T value) // bits req->setPC(((Addr) value) << 2); - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); // pkt->setData(data); @@ -81,14 +79,6 @@ CenteralController::createUpdatePacket(Addr addr, T value) return pkt; } -// AddrRangeList -// CenteralController::ReqPort::getAddrRanges() const -// { -// AddrRangeList ret; -// ret.clear(); -// return ret; -// } - void CenteralController::ReqPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e54447fd09..e6503ea01d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/wl_engine.hh" +#include "base/intmath.hh" #include 
"debug/ApplyUpdates.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -47,6 +48,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) { + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); cacheBlocks = new Block [numLines]; for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); @@ -72,18 +74,25 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + // TODO: Add a hit latency as a param for this object. + // Can't just schedule the nextRespondEvent for latency cycles in + // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); + // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.readHits++; @@ -104,6 +113,8 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries DPRINTF(MPU, "%s: Out of MSHR entries. 
" "Rejecting request.\n", __func__); + // TODO: Break out read rejections into more than one stat + // based on the cause of the rejection stats.readRejections++; return false; } else { @@ -200,6 +211,7 @@ CoalesceEngine::recvReadAddr(Addr addr) } } +// TODO: For loop to empty the entire responseQueue. void CoalesceEngine::processNextRespondEvent() { @@ -241,8 +253,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } Addr addr = pkt->getAddr(); - uint8_t* data = pkt->getPtr(); - int block_index = (addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", @@ -250,17 +260,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - cacheBlocks[block_index].items[i] = *((WorkListItem*) ( - data + (i * sizeof(WorkListItem)))); DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; delete pkt; - int bias = 0; + // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; @@ -271,20 +281,26 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); + // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].takenMask |= (1 << wl_offset); + // End of the said block servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); } } + // TODO: We Can use taken instead of this + // TODO: Change the MSHRMap from map to map + int bias = 0; for (int i = 0; i < servicedIndices.size(); i++) { Addr print_addr = MSHRMap[block_index][i - bias]; MSHRMap[block_index].erase(MSHRMap[block_index].begin() + @@ -298,8 +314,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHRMap.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { - // TODO: I think this is unnecessary. - cacheBlocks[block_index].hasConflict = true; + assert(cacheBlocks[block_index].hasConflict); } if ((!nextRespondEvent.scheduled()) && @@ -341,11 +356,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { cacheBlocks[block_index].hasChange = true; + stats.numVertexWrites++; } cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
@@ -380,8 +395,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); + if (cacheBlocks[block_index].takenMask == 0) { - if ((cacheBlocks[block_index].hasChange)&& + if ((cacheBlocks[block_index].hasChange) && (cacheBlocks[block_index].hasConflict) && (memReqQueueHasSpace(2))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", @@ -420,6 +436,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].items[i].to_string()); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); + // TODO: Add a stat to count the number of changed props. DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", __func__, block_index, i); } @@ -434,117 +451,65 @@ CoalesceEngine::processNextApplyAndCommitEvent() (uint8_t*) cacheBlocks[block_index].items); DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), peerMemoryAtomSize); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * - peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket( - aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
- " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); - - enqueueMemReq(write_pkt); - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } - } - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - enqueueMemReq(write_pkt); - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } + enqueueMemReq(write_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet to " + "outstandingMemReqQueue.\n" , __func__); + + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", + __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); } - - // Since allocated is false, does not matter what the address is. - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } - } else { - DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary.\n", __func__, block_index); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); + } - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * + if (cacheBlocks[block_index].hasConflict) { + assert(!MSHRMap[block_index].empty()); + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for a read " + "packet.\n", __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket( - aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " - "deallocating the line.\n", __func__, block_index); - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, + peerMemoryAtomSize); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); + enqueueMemReq(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue.\n" , __func__); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue.\n", __func__); + + // Since allocated is false, does not matter what the address is. + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; } + } else { DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " "for eviction. 
Therefore, ignoring the evict schedule.\n", diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ddfc2edef8..e822b7168b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -238,7 +238,7 @@ PushEngine::createUpdatePacket(Addr addr, T value) req->setPC(((Addr) _requestorId) << 2); // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); // pkt->setData(data); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 40fca42d26..148f5de5be 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -121,6 +121,8 @@ WLEngine::getAddrRanges() const return coalesceEngine->getAddrRanges(); } +// TODO: Parameterize the number of pops WLEngine can do at a time. +// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void WLEngine::processNextReadEvent() { @@ -144,9 +146,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); - if (updateQueue.size() == updateQueueSize - 1) { - respPort.checkRetryReq(); - } + respPort.checkRetryReq(); } } } else { @@ -164,9 +164,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" ". 
updateQueue.size = %u.\n", __func__, updateQueue.size()); - if (updateQueue.size() == updateQueueSize - 1) { - respPort.checkRetryReq(); - } + respPort.checkRetryReq(); } // TODO: Only schedule nextReadEvent only when it has to be scheduled @@ -194,12 +192,9 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - std::unordered_map::iterator it = - addrWorkListMap.begin(); - - std::vector servicedAddresses; - while (it != addrWorkListMap.end()) { - Addr addr = it->first; + for (auto &it : addrWorkListMap) { + Addr addr = it.first; + assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); uint32_t update_value = onTheFlyUpdateMap[addr]; DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " "addrWorkListMap values. onTheFlyUpdateMap[%lu] = %u, " @@ -214,17 +209,9 @@ WLEngine::processNextReduceEvent() stats.numReduce++; coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); - servicedAddresses.push_back(addr); - DPRINTF(MPU, "%s: Added addr: %lu to servicedAdresses.\n", - __func__, addr); - it++; - } - - addrWorkListMap.clear(); - for (int i = 0; i < servicedAddresses.size(); i++) { - onTheFlyUpdateMap.erase(servicedAddresses[i]); + onTheFlyUpdateMap.erase(addr); DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, servicedAddresses[i]); + __func__, addr); } } diff --git a/src/mem/packet.cc b/src/mem/packet.cc index da45246e49..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL"} }; AddrRange @@ -532,43 +533,4 @@ Packet::getHtmTransactionUid() const return htmTransactionUid; } -std::string -Packet::printData() -{ - char ret[1024]; - if (isWrite()) { - uint8_t* data = getPtr(); - std::sprintf(ret,"\n" - "V[%lu] temp_prop: %u, prop: %u, " 
- "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n", - getAddr(), - *((uint32_t*) data), - *((uint32_t*) (data + 4)), - *((uint32_t*) (data + 8)), - *((uint32_t*) (data + 12)), - getAddr() + 16, - *((uint32_t*) (data + 16)), - *((uint32_t*) (data + 20)), - *((uint32_t*) (data + 24)), - *((uint32_t*) (data + 28)), - getAddr() + 32, - *((uint32_t*) (data + 32)), - *((uint32_t*) (data + 36)), - *((uint32_t*) (data + 40)), - *((uint32_t*) (data + 44)), - getAddr() + 48, - *((uint32_t*) (data + 48)), - *((uint32_t*) (data + 52)), - *((uint32_t*) (data + 56)), - *((uint32_t*) (data + 60))); - } - return ret; -} - } // namespace gem5 diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 8803eacced..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,7 +149,7 @@ class MemCmd // Tlb shootdown TlbiExtSync, // MPU Accelerator - // UpdateWL, + UpdateWL, NUM_MEM_CMDS }; @@ -1374,8 +1374,6 @@ class Packet : public Printable template void setRaw(T v); - std::string printData(); - public: /** * Check a functional request against a memory value stored in From e4b665c796dbe348a511585c3eb2c1b3d87630b4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Apr 2022 20:28:25 -0700 Subject: [PATCH 096/247] A little bit of debugging and updating config script. 
--- configs/accl/sega.py | 138 +++++++++++++++++++++++-------- src/accl/graph/TODO.md | 5 +- src/accl/graph/sega/wl_engine.cc | 1 + src/accl/graph/sega/wl_engine.hh | 2 +- 4 files changed, 105 insertions(+), 41 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index aa3675d847..9dd8c0f358 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,5 +1,9 @@ import m5 +import argparse + +from math import log from m5.objects import * +from m5.util.convert import toMemorySize class MPU(SubSystem): def __init__(self, base_edge_addr): @@ -35,53 +39,115 @@ def setEdgeMemPort(self, port): self.push_engine.mem_port = port class MPUMemory(SubSystem): - def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): + def __init__(self, + num_channels: int, + cache_line_size: int, + vertex_memory_size: str, + edge_memory_size: str, + graph_path: str): super(MPUMemory, self).__init__() - self.vertex_mem_ctrl = SimpleMemory( - range=vertex_range, bandwidth="19.2GB/s", - latency="30ns", image_file=vertex_binary) - self.edge_mem_ctrl = SimpleMemory( - range=edge_range, bandwidth="19.2GB/s", - latency="30ns", image_file=edge_binary) - - def getVertexPort(self): - return self.vertex_mem_ctrl.port - def setVertexPort(self, port): - self.vertex_mem_ctrl.port = port - - def getEdgePort(self): - return self.edge_mem_ctrl.port - def setEdgePort(self, port): - self.edge_mem_ctrl.port = port + + self._vertex_ranges = self._interleave_addresses( + AddrRange(start=0, size=vertex_memory_size),\ + num_channels,\ + cache_line_size) + + self._edge_chunk_size = int(\ + toMemorySize(edge_memory_size)/num_channels) + self._edge_ranges = [AddrRange(\ + start=toMemorySize(vertex_memory_size)+\ + self._edge_chunk_size*i,\ + size=self._edge_chunk_size)\ + for i in range(num_channels)] + + vertex_mem_ctrl = [] + edge_mem_ctrl = [] + for i in range(num_channels): + vertex_mem_ctrl.append( + SimpleMemory(range=self._vertex_ranges[i], + bandwidth="19.2GB/s", + 
latency="30ns", + image_file=f"{graph_path}/vertices_{i}") + ) + edge_mem_ctrl.append( + SimpleMemory(range=self._edge_ranges[i], + bandwidth="19.2GB/s", + latency="30ns", + image_file=f"{graph_path}/edgelist_{i}") + ) + self.vertex_mem_ctrl = vertex_mem_ctrl + self.edge_mem_ctrl = edge_mem_ctrl + + def _interleave_addresses(self, + plain_range, + num_channels, + cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + + def getVertexPort(self, i): + return self.vertex_mem_ctrl[i].port + def setVertexPort(self, port, i): + self.vertex_mem_ctrl[i].port = port + + def getEdgeBaseAddr(self, i): + return self._edge_ranges[i].start + def getEdgePort(self, i): + return self.edge_mem_ctrl[i].port + def setEdgePort(self, port, i): + self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self): + def __init__(self, num_mpus, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.ctrl = CenteralController(addr=0, value=0) - self.mpu = MPU(base_edge_addr=0x80000000) - self.mem_ctrl = MPUMemory( - vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/epinions/graph_binaries/vertices_0", - edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/epinions/graph_binaries/edgelist_0") - self.interconnect = SystemXBar() + self.interconnect = NoncoherentXBar(frontend_latency=1, + forward_latency=1, + response_latency=1, + width=64) + self.ctrl = CenteralController(addr=0, value=0) self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mpu.setReqPort(self.interconnect.cpu_side_ports) - self.mpu.setRespPort(self.interconnect.mem_side_ports) - 
self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) - self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) + self.mem_ctrl = MPUMemory(num_mpus, 32, "2GiB", "2GiB", graph_path) + + mpus = [] + for i in range(num_mpus): + mpus.append(MPU(base_edge_addr=self.mem_ctrl.getEdgeBaseAddr(i))) + mpus[i].setReqPort(self.interconnect.cpu_side_ports) + mpus[i].setRespPort(self.interconnect.mem_side_ports) + mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) + mpus[i].setEdgeMemPort(self.mem_ctrl.getEdgePort(i)) + self.mpu = mpus + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_mpus", type=int) + argparser.add_argument("graph_path", type=str) + args = argparser.parse_args() + return args.num_mpus, args.graph_path -system = SEGA() -root = Root(full_system = False, system = system) +if __name__ == "__m5_main__": + num_mpus, graph_path = get_inputs() + print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") + system = SEGA(num_mpus, graph_path) + root = Root(full_system = False, system = system) -m5.instantiate() + m5.instantiate() -exit_event = m5.simulate() -print("Simulation finished!") -exit() + exit_event = m5.simulate() + print("Simulation finished!") + exit() diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index f5690a3faa..29b5a2939e 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,8 +1,5 @@ # TODO Items -* implement all the communications between simobjects as req/retry. -* get rid of maps with RequestPtr as keys -* add UpdateWL as a MemCmd * Replace std::floor with roundDown from intmath.hh in src * We might need to revisit the fact that we could insert something to a queue on - the same cycle that another event is consuming something from the queue. + the same cycle that another event is consuming something from the queue. 
\ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 148f5de5be..e949cbcf5b 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -213,6 +213,7 @@ WLEngine::processNextReduceEvent() DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", __func__, addr); } + addrWorkListMap.clear(); } bool diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2698ce3ea8..597fdb2b1e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -52,7 +52,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) + ResponsePort(name, owner), owner(owner), needSendRetryReq(false) {} virtual AddrRangeList getAddrRanges() const; From c8b7b26fcc071883bb70cbaf31b936249a4b20be Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Apr 2022 16:56:04 -0700 Subject: [PATCH 097/247] Adding initState to CenteralController. 
--- configs/accl/sega.py | 23 ++++++++++------ src/accl/graph/sega/CenteralController.py | 3 ++ src/accl/graph/sega/centeral_controller.cc | 32 ++++++++++++++++++++++ src/accl/graph/sega/centeral_controller.hh | 6 +++- 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 9dd8c0f358..0907ba77de 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -66,8 +66,7 @@ def __init__(self, vertex_mem_ctrl.append( SimpleMemory(range=self._vertex_ranges[i], bandwidth="19.2GB/s", - latency="30ns", - image_file=f"{graph_path}/vertices_{i}") + latency="30ns") ) edge_mem_ctrl.append( SimpleMemory(range=self._edge_ranges[i], @@ -108,21 +107,28 @@ def setEdgePort(self, port, i): self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self, num_mpus, graph_path): + def __init__(self, num_mpus, vertex_cache_line_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = vertex_cache_line_size self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0) + self.ctrl = CenteralController(addr=0, value=0, + image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mem_ctrl = MPUMemory(num_mpus, 32, "2GiB", "2GiB", graph_path) + self.mem_ctrl = MPUMemory( + num_mpus, + self.cache_line_size, + "2GiB", + "2GiB", + graph_path) mpus = [] for i in range(num_mpus): @@ -136,14 +142,15 @@ def __init__(self, num_mpus, graph_path): def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) + argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("graph_path", type=str) args = argparser.parse_args() - return args.num_mpus, args.graph_path + return args.num_mpus, args.vertex_cache_line_size, 
args.graph_path if __name__ == "__m5_main__": - num_mpus, graph_path = get_inputs() + num_mpus, vertex_cache_line_size, graph_path = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, graph_path) + system = SEGA(num_mpus, vertex_cache_line_size, graph_path) root = Root(full_system = False, system = system) m5.instantiate() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 7b00f8b12d..bd2f6320a8 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -34,6 +34,9 @@ class CenteralController(ClockedObject): cxx_header = "accl/graph/sega/centeral_controller.hh" cxx_class = 'gem5::CenteralController' + system = Param.System(Parent.any, "System this Engine is a part of") req_port = RequestPort("Port to send updates to the outside") addr = Param.Addr("") value = Param.Int(0, "") + + image_file = Param.String("Path to the global memory image.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 41ebeb9cd6..3c05972224 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,9 @@ #include "accl/graph/sega/centeral_controller.hh" +#include "base/loader/memory_image.hh" +#include "base/loader/object_file.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" namespace gem5 @@ -36,6 +39,7 @@ namespace gem5 CenteralController::CenteralController (const CenteralControllerParams ¶ms): ClockedObject(params), + system(params.system), reqPort(name() + ".req_port", this), addr(params.addr), value(params.value) @@ -51,6 +55,26 @@ CenteralController::getPort(const std::string &if_name, PortID idx) } } +void +CenteralController::initState() +{ + ClockedObject::initState(); + + const auto &file = params().image_file; + if (file == "") + return; + + auto *object = loader::createObjectFile(file, true); + 
fatal_if(!object, "%s: Could not load %s.", name(), file); + + loader::debugSymbolTable.insert(*object->symtab().globals()); + loader::MemoryImage image = object->buildImage(); + PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, + system->cacheLineSize()); + + panic_if(!image.write(proxy), "%s: Unable to write image."); +} + void CenteralController::startup() { @@ -110,4 +134,12 @@ CenteralController::ReqPort::recvReqRetry() } } +void +CenteralController::functionalAccess(PacketPtr pkt) +{ + DPRINTF(MPU, "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); + reqPort.sendFunctional(pkt); +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 0e1bb6ac80..102800de92 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -61,16 +61,20 @@ class CenteralController : public ClockedObject virtual void recvReqRetry(); }; + System* system; ReqPort reqPort; Addr addr; uint32_t value; - template PacketPtr + template PacketPtr createUpdatePacket(Addr addr, T value); + virtual void initState(); virtual void startup(); + void functionalAccess(PacketPtr pkt); + public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); From f0bf6143f964c3ddbd5197d1d77efee8fe0381e8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Apr 2022 15:28:06 -0700 Subject: [PATCH 098/247] Changing debug flag for CenteralController. 
--- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/centeral_controller.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index c8810bbdb2..16fab86ede 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -38,3 +38,4 @@ Source('push_engine.cc') Source('wl_engine.cc') DebugFlag('ApplyUpdates') +DebugFlag('CenteralController') diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 3c05972224..f19c93ebac 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -30,7 +30,7 @@ #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" -#include "debug/MPU.hh" +#include "debug/CenteralController.hh" #include "mem/packet_access.hh" namespace gem5 @@ -137,7 +137,8 @@ CenteralController::ReqPort::recvReqRetry() void CenteralController::functionalAccess(PacketPtr pkt) { - DPRINTF(MPU, "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", + DPRINTF(CenteralController, + "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); reqPort.sendFunctional(pkt); } From 4485e3b2b981fc620daabd7470d8bc8d9adcf978 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 3 May 2022 09:33:52 -0700 Subject: [PATCH 099/247] Fixing a bug and adding new stats. 
--- configs/accl/sega.py | 9 ++++++--- src/accl/graph/sega/coalesce_engine.cc | 4 +++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 19 ++++++++++++++++++- src/accl/graph/sega/push_engine.hh | 13 +++++++++++++ 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0907ba77de..bfdad58f72 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,13 +9,15 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=0, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, - attached_memory_atom_size=32) + attached_memory_atom_size=32, + cache_size="1MiB", + num_mshr_entry=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, + update_queue_size=32, on_the_fly_update_map_size=8) def getRespPort(self): @@ -113,6 +115,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = vertex_cache_line_size + self.mem_mode = "timing" self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e6503ea01d..fbe593507a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -199,7 +199,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if (aligned_addr != cacheBlocks[block_index].addr) { stats.readMisses++; } else { - stats.readHits++; + stats.readHitUnderMisses++; } MSHRMap[block_index].push_back(addr); @@ -538,6 +538,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hits."), ADD_STAT(readMisses, statistics::units::Count::get(), "Number of cache misses."), 
+ ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), "Number of cache rejections.") { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index efd19d3e9b..ce019ef969 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,6 +106,7 @@ class CoalesceEngine : public BaseMemEngine statistics::Scalar numVertexWrites; statistics::Scalar readHits; statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e822b7168b..69b9f3f23e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -40,7 +40,8 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()) + nextPushEvent([this] { processNextPushEvent(); }, name()), + stats(*this) {} Port& @@ -207,6 +208,7 @@ PushEngine::processNextPushEvent() if (!reqPort.blocked()) { reqPort.sendPacket(update); + stats.numUpdates++; DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); @@ -247,4 +249,19 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } +PushEngine::PushStats::PushStats(PushEngine &_push) + : statistics::Group(&_push), + push(_push), + + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of sent updates.") +{ +} + +void +PushEngine::PushStats::regStats() +{ + using namespace statistics; +} + } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 
ce9045e91a..7a6981daa0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -120,6 +120,19 @@ class PushEngine : public BaseMemEngine EventFunctionWrapper nextPushEvent; void processNextPushEvent(); + struct PushStats : public statistics::Group + { + PushStats(PushEngine &push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numUpdates; + }; + + PushStats stats; + protected: virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); From c17fb8b04a02fdd590aa3ea5df55cedef47b1f18 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 17 May 2022 10:56:09 -0700 Subject: [PATCH 100/247] Fixing double evicts. --- configs/accl/sega.py | 6 +++--- src/accl/graph/sega/coalesce_engine.cc | 27 ++++++++++---------------- src/accl/graph/sega/coalesce_engine.hh | 3 --- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index bfdad58f72..b799b05dc5 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -122,7 +122,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0, + self.ctrl = CenteralController(addr=192, value=0, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports @@ -130,7 +130,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): num_mpus, self.cache_line_size, "2GiB", - "2GiB", + "14GiB", graph_path) mpus = [] @@ -158,6 +158,6 @@ def get_inputs(): m5.instantiate() - exit_event = m5.simulate() + exit_event = m5.simulate(1000000000000) print("Simulation finished!") exit() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fbe593507a..b41f6b1db7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -325,22 +325,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } -PacketPtr 
-CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { @@ -370,7 +354,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); - evictQueue.push_back(block_index); + // TODO: Fix this hack + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + } + } + if (!found) { + evictQueue.push_back(block_index); + } DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ce019ef969..e86014fc25 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -85,9 +85,6 @@ class CoalesceEngine : public BaseMemEngine std::deque evictQueue; - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); - // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); - EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); From 4c8ebec475ae4473c8819f59cc3c09804613d7bc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 18 May 2022 17:23:05 -0700 Subject: [PATCH 101/247] Fixing false dependency and deadlock issues. wip. 
--- src/accl/graph/sega/coalesce_engine.cc | 74 +++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b41f6b1db7..92d82bce35 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -347,9 +347,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); + // TODO: Make this more general and programmable. - // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add - // to evictQueue. if ((cacheBlocks[block_index].takenMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", @@ -359,6 +358,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) for (auto i : evictQueue) { if (i == block_index) { found = true; + break; } } if (!found) { @@ -376,6 +376,76 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } +void +CoalesceEngine::processNextApplyEvent() +{ + int block_index = applyQueue.front(); + + if (cacheBlocks[block_index].takenMask) { + DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " + "Therefore, ignoring the apply schedule.\n", + __func__, block_index); + stats.falseApplySchedules++; + } else if (!cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " + "needed. 
Adding the cache line to evict schedule.\n", + __func__, block_index); + evictQueue.push_back(block_index); + } else { + for (int i = 0; i < numElementsPerLine; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + // TODO: Is this correct? + cacheBlocks[block_index].items[i].tempProp = cacheBlocks[block_index].items[i].prop; + + if (cacheBlocks[block_index].items[i].prop != old_prop) { + if (peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i])) { + DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", + __func__, + cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); + } else { + // peerPushEngine->setPushAlarm(); + // pendingPushAlarm = true; + return; + } + } + } + // TODO: This is where eviction policy goes + evictQueue.push_back(block_index); + } + + applyQueue.pop_front(); + + if ((!evictQueue.empty()) && + (!pendingAlarm()) && + (!nextEvictEvent.scheduled())) { + schedule(nextEvictEvent, nextCycle()); + } + + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextEvictEvent() +{ + int block_index = evictQueue.front(); + + if (cacheBlocks[block_index].takenMask) { + DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " + "Therefore, ignoring the apply schedule.\n", + __func__, block_index); + stats.falseEvictSchedules++; + } else { + int space_needed = cacheBlocks + } +} + void CoalesceEngine::processNextApplyAndCommitEvent() { From 7e7f09d79330b2de27c62d3d07e7bf141c20ccd3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 19 May 2022 12:10:10 -0700 Subject: [PATCH 102/247] Decoupling apply and evict. Done. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 214 ++++++++----------------- src/accl/graph/sega/coalesce_engine.hh | 11 +- 3 files changed, 81 insertions(+), 146 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b799b05dc5..9d8b449e0f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -122,7 +122,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=192, value=0, + self.ctrl = CenteralController(addr=0, value=0, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 92d82bce35..f3402255bc 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,7 +45,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), + nextApplyEvent([this] { processNextApplyEvent(); }, name()), + nextEvictEvent([this] { processNextEvictEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -237,8 +238,8 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::respondToAlarm() { - assert(!nextApplyAndCommitEvent.scheduled()); - schedule(nextApplyAndCommitEvent, nextCycle()); + assert(pendingAlarm() && (!nextEvictEvent.scheduled())); + schedule(nextEvictEvent, nextCycle()); } bool @@ -362,16 +363,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } } if (!found) { - evictQueue.push_back(block_index); + applyQueue.push_back(block_index); } DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } - if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty()) && - (!pendingAlarm())) { - schedule(nextApplyAndCommitEvent, nextCycle()); + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); } } @@ -442,150 +442,74 @@ CoalesceEngine::processNextEvictEvent() __func__, block_index); stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks - } -} - -void -CoalesceEngine::processNextApplyAndCommitEvent() -{ - // FIXME: Refactor the line below to work with the new inheritance. - // assert((!alarmRequested) && (spaceRequested == 0)); - int block_index = evictQueue.front(); - uint8_t changedMask = 0; - - DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", - __func__, block_index); - DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " - "then commited.\n", __func__, block_index); - - if (cacheBlocks[block_index].takenMask == 0) { - if ((cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(2))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict)) { - DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", - __func__, block_index); - } else { - int spaceNeeded = cacheBlocks[block_index].hasConflict ? 
2 : 1; - requestAlarm(spaceNeeded); - DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " - "an alarm for nextApplyAndCommitEvent when there is %d space.\n", - __func__, spaceNeeded); + int space_needed = cacheBlocks[block_index].hasChange ? + (cacheBlocks[block_index].hasConflict ? 2 : 1) : + (cacheBlocks[block_index].hasConflict ? 1 : 0); + if (!memReqQueueHasSpace(space_needed)) { + DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " + "procees the eviction of cache line [%d]. hasChange: %d, " + "hasConflict: %d.\n", __func__, block_index, + cacheBlocks[block_index].hasChange, + cacheBlocks[block_index].hasConflict); + requestAlarm(space_needed); return; - } - - // Reducing between tempProp and prop for each item in the cache line. - for (int i = 0; i < numElementsPerLine; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, - block_index, i, - cacheBlocks[block_index].items[i].to_string()); - if (old_prop != cacheBlocks[block_index].items[i].prop) { - changedMask |= (1 << i); - // TODO: Add a stat to count the number of changed props. 
- DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", - __func__, block_index, i); + } else { + if (cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", + __func__, block_index); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, " + "size = %d.\n", __func__, + write_pkt->getAddr(), write_pkt->getSize()); + enqueueMemReq(write_pkt); } - } - if (cacheBlocks[block_index].hasChange) { - DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" - , __func__, block_index); - - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = %d.\n", - __func__, write_pkt->getAddr(), peerMemoryAtomSize); - enqueueMemReq(write_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet to " - "outstandingMemReqQueue.\n" , __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", - __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } - } - } + if (cacheBlocks[block_index].hasConflict) { + assert(!MSHRMap[block_index].empty()); + Addr miss_addr = MSHRMap[block_index].front(); + DPRINTF(MPU, "%s: First conflicting address for cache line[%d]" + " is Addr: %lu.\n", __func__, block_index, miss_addr); - if (cacheBlocks[block_index].hasConflict) { - assert(!MSHRMap[block_index].empty()); - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for a read " - "packet.\n", __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, - peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - // Since allocated is false, does not matter what the address is. - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, + peerMemoryAtomSize); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, + read_pkt->getAddr(), read_pkt->getSize()); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", + __func__, block_index, aligned_miss_addr); + } else { - } else { - DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " - "for eviction. Therefore, ignoring the evict schedule.\n", - __func__, block_index); + // Since allocated is false, does not matter what the address is. + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", + __func__, block_index); + } + } } evictQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())) { - schedule(nextApplyAndCommitEvent, nextCycle()); + if ((!evictQueue.empty()) && + (!nextEvictEvent.scheduled())) { + schedule(nextEvictEvent, nextCycle()); } } @@ -604,7 +528,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), - "Number of cache rejections.") + "Number of cache rejections."), + ADD_STAT(falseApplySchedules, statistics::units::Count::get(), + "Number of failed apply schedules."), + ADD_STAT(falseEvictSchedules, statistics::units::Count::get(), + "Number of failed evict schedules.") { } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e86014fc25..82b03f53aa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -83,13 +83,18 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + std::deque applyQueue; + std::deque evictQueue; EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); - EventFunctionWrapper nextApplyAndCommitEvent; - void processNextApplyAndCommitEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + + EventFunctionWrapper nextEvictEvent; + void processNextEvictEvent(); struct CoalesceStats : public statistics::Group { @@ -105,6 +110,8 @@ class CoalesceEngine : public BaseMemEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; + statistics::Scalar falseApplySchedules; + statistics::Scalar falseEvictSchedules; }; CoalesceStats stats; From 550a9fed64190cb41db8366425e3b793c8c5ada8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 19 May 2022 21:20:07 -0700 Subject: [PATCH 103/247] Fixed miss-deallocation 
bug. Hopefully. --- configs/accl/sega.py | 2 +- src/accl/graph/base/base_mem_engine.cc | 24 +++--- src/accl/graph/base/base_mem_engine.hh | 17 ++-- src/accl/graph/sega/coalesce_engine.cc | 107 +++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 13 +-- src/accl/graph/sega/push_engine.cc | 26 ++++-- src/accl/graph/sega/push_engine.hh | 11 ++- src/accl/graph/sega/wl_engine.cc | 1 - src/accl/graph/sega/wl_engine.hh | 1 - 9 files changed, 136 insertions(+), 66 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 9d8b449e0f..31b65ae726 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=0, + push_req_queue_size=16, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 50e64ae7c3..f02f1d2feb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -37,8 +37,8 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): system(params.system), memPort(name() + ".mem_port", this), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - alarmRequested(false), - spaceRequested(0), + memAlarmRequested(false), + memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)), peerMemoryAtomSize(params.attached_memory_atom_size) @@ -106,12 +106,12 @@ BaseMemEngine::processNextMemReqEvent() __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - if (alarmRequested && + if (memAlarmRequested && (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - spaceRequested))) { - alarmRequested = false; - spaceRequested = 0; - respondToAlarm(); + 
(outstandingMemReqQueueSize - memSpaceRequested))) { + memAlarmRequested = false; + memSpaceRequested = 0; + respondToMemAlarm(); } if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { @@ -151,7 +151,7 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseMemEngine::memReqQueueHasSpace(int space) +BaseMemEngine::allocateMemReqSpace(int space) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return ( @@ -179,13 +179,13 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) } void -BaseMemEngine::requestAlarm(int space) { - panic_if((alarmRequested == true) || (spaceRequested != 0), +BaseMemEngine::requestMemAlarm(int space) { + panic_if((memAlarmRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); - alarmRequested = true; - spaceRequested = space; + memAlarmRequested = true; + memSpaceRequested = space; } void diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index fb7cab91b0..8a18807e2e 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -69,8 +69,8 @@ class BaseMemEngine : public ClockedObject MemPort memPort; int outstandingMemReqQueueSize; - bool alarmRequested; - int spaceRequested; + bool memAlarmRequested; + int memSpaceRequested; std::deque outstandingMemReqQueue; EventFunctionWrapper nextMemReqEvent; @@ -81,15 +81,16 @@ class BaseMemEngine : public ClockedObject size_t peerMemoryAtomSize; - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - - bool memReqQueueHasSpace(int space); + bool allocateMemReqSpace(int space); bool memReqQueueFull(); + + bool pendingMemAlarm() { return memAlarmRequested; } + void requestMemAlarm(int space); + + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void 
enqueueMemReq(PacketPtr pkt); - bool pendingAlarm() { return alarmRequested; } - void requestAlarm(int space); - virtual void respondToAlarm() = 0; + virtual void respondToMemAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f3402255bc..36faff2c6a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -44,6 +44,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + pendingPushAlarm(false), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -54,6 +55,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + peerPushEngine->registerCoalesceEngine(this); } void @@ -91,10 +93,11 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, responseQueue.size(), - cacheBlocks[block_index].items[wl_offset].to_string()); + __func__, addr, block_index, wl_offset, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].takenMask |= (1 << wl_offset); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; assert(!responseQueue.empty()); @@ -156,7 +159,7 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -236,9 +239,9 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::respondToAlarm() +CoalesceEngine::respondToMemAlarm() { - assert(pendingAlarm() && (!nextEvictEvent.scheduled())); + assert(pendingMemAlarm() && (!nextEvictEvent.scheduled())); schedule(nextEvictEvent, nextCycle()); } @@ -290,7 +293,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) , __func__, block_index, wl_offset, responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].takenMask |= (1 << wl_offset); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block servicedIndices.push_back(i); @@ -336,27 +339,27 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); - assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { - cacheBlocks[block_index].hasChange = true; + cacheBlocks[block_index].dirty = true; stats.numVertexWrites++; } cacheBlocks[block_index].items[wl_offset] = wl; - cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].takenMask == 0)) { + if ((cacheBlocks[block_index].busyMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); // TODO: Fix this hack bool found = false; - for (auto i : evictQueue) { + for (auto i : applyQueue) { if (i == block_index) { found = true; break; @@ -364,12 +367,13 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (!found) { applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); } - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); } if ((!applyQueue.empty()) && + (!pendingPushAlarm) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } @@ -381,16 +385,27 @@ CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); - if (cacheBlocks[block_index].takenMask) { + if (cacheBlocks[block_index].busyMask) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; - } else if (!cacheBlocks[block_index].hasChange) { + } else if (!cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " "needed. Adding the cache line to evict schedule.\n", __func__, block_index); - evictQueue.push_back(block_index); + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); + } } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; @@ -407,20 +422,32 @@ CoalesceEngine::processNextApplyEvent() __func__, cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); } else { - // peerPushEngine->setPushAlarm(); - // pendingPushAlarm = true; + peerPushEngine->setPushAlarm(); + pendingPushAlarm = true; return; } } } // TODO: This is where eviction policy goes - evictQueue.push_back(block_index); + // TODO: Fix this hack. + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); + } } applyQueue.pop_front(); if ((!evictQueue.empty()) && - (!pendingAlarm()) && + (!pendingMemAlarm()) && (!nextEvictEvent.scheduled())) { schedule(nextEvictEvent, nextCycle()); } @@ -436,25 +463,33 @@ CoalesceEngine::processNextEvictEvent() { int block_index = evictQueue.front(); - if (cacheBlocks[block_index].takenMask) { + bool found_in_apply_queue = false; + for (auto i : applyQueue) { + if (i == block_index) { + found_in_apply_queue = true; + break; + } + } + if ((cacheBlocks[block_index].busyMask) || + (found_in_apply_queue)) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks[block_index].hasChange ? + int space_needed = cacheBlocks[block_index].dirty ? (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!memReqQueueHasSpace(space_needed)) { + if (!allocateMemReqSpace(space_needed)) { DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " - "procees the eviction of cache line [%d]. hasChange: %d, " + "procees the eviction of cache line [%d]. 
dirty: %d, " "hasConflict: %d.\n", __func__, block_index, - cacheBlocks[block_index].hasChange, + cacheBlocks[block_index].dirty, cacheBlocks[block_index].hasConflict); - requestAlarm(space_needed); + requestMemAlarm(space_needed); return; } else { - if (cacheBlocks[block_index].hasChange) { + if (cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", __func__, block_index); PacketPtr write_pkt = createWritePacket( @@ -484,21 +519,21 @@ CoalesceEngine::processNextEvictEvent() enqueueMemReq(read_pkt); cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; + cacheBlocks[block_index].dirty = false; DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", __func__, block_index, aligned_miss_addr); } else { // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; + cacheBlocks[block_index].dirty = false; DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", __func__, block_index); } @@ -513,6 +548,14 @@ CoalesceEngine::processNextEvictEvent() } } +void +CoalesceEngine::respondToPushAlarm() +{ + assert(pendingPushAlarm && (!nextApplyEvent.scheduled())); + pendingPushAlarm = false; + schedule(nextApplyEvent, nextCycle()); +} + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 82b03f53aa..824faef10d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -49,21 +49,21 @@ class CoalesceEngine : public BaseMemEngine { WorkListItem* items; Addr addr; - uint8_t takenMask; + uint8_t busyMask; bool allocated; bool valid; bool hasConflict; - bool hasChange; + bool dirty; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} Block(int num_elements): addr(0), - takenMask(0), + busyMask(0), allocated(false), valid(false), hasConflict(false), - hasChange(false) + dirty(false) { items = new WorkListItem [num_elements]; } @@ -83,6 +83,7 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + bool pendingPushAlarm; std::deque applyQueue; std::deque evictQueue; @@ -117,7 +118,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; protected: - virtual void respondToAlarm(); + virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -131,6 +132,8 @@ class CoalesceEngine : public BaseMemEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); + + 
void respondToPushAlarm(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 69b9f3f23e..d5563cca7c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -36,6 +37,7 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), + pushAlarmSet(false), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), @@ -56,6 +58,12 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine) +{ + peerCoalesceEngine = coalesce_engine; +} + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -146,11 +154,15 @@ PushEngine::processNextAddrGenEvent() DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + if (pushAlarmSet && (pushReqQueue.size() == pushReqQueueSize - 1)) { + pushAlarmSet = false; + peerCoalesceEngine->respondToPushAlarm(); + } } if (memReqQueueFull()) { if (!pushReqQueue.empty()) { - requestAlarm(1); + requestMemAlarm(1); } return; } @@ -161,7 +173,7 @@ PushEngine::processNextAddrGenEvent() } void -PushEngine::respondToAlarm() +PushEngine::respondToMemAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); @@ -200,9 +212,6 @@ PushEngine::processNextPushEvent() // TODO: Implement propagate function here uint32_t update_value = value + 1; - DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); - PacketPtr update = createUpdatePacket( curr_edge->neighbor, update_value); @@ -249,6 +258,13 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } +void +PushEngine::setPushAlarm() +{ + assert(!pushAlarmSet); + pushAlarmSet = true; +} + PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7a6981daa0..ce24f862ba 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -36,6 +36,8 @@ namespace gem5 { +class CoalesceEngine; + class PushEngine : public BaseMemEngine { private: @@ -95,6 +97,9 @@ class PushEngine : public BaseMemEngine virtual void recvReqRetry(); }; + bool pushAlarmSet; + CoalesceEngine* peerCoalesceEngine; + ReqPort reqPort; Addr baseEdgeAddr; @@ -134,7 +139,7 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: - virtual void respondToAlarm(); + virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -145,6 +150,10 @@ class PushEngine : public BaseMemEngine PortID idx=InvalidPortID) override; bool recvWLItem(WorkListItem wl); + + void 
registerCoalesceEngine(CoalesceEngine* coalesce_engine); + + void setPushAlarm(); }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e949cbcf5b..75ac4f784e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -37,7 +37,6 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseReduceEngine(params), respPort(name() + ".resp_port", this), - blockedByCoalescer(false), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 597fdb2b1e..27fc3efa7a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -69,7 +69,6 @@ class WLEngine : public BaseReduceEngine RespPort respPort; - bool blockedByCoalescer; CoalesceEngine* coalesceEngine; int updateQueueSize; From 929aab118886fde9e286876fd2dc997be0a8684c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 22 May 2022 14:15:30 -0700 Subject: [PATCH 104/247] Correctness passed with finite push queue and facebook graph. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 20 +++++++++++++------- src/accl/graph/sega/push_engine.cc | 13 ++++++------- src/accl/graph/sega/push_engine.hh | 3 ++- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 31b65ae726..8a6ac783c3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -158,6 +158,6 @@ def get_inputs(): m5.instantiate() - exit_event = m5.simulate(1000000000000) + exit_event = m5.simulate() print("Simulation finished!") exit() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 36faff2c6a..39144972df 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -349,7 +349,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, + DPRINTF(MPU, "%s: Wrote to cache line[%d][%d] = %s.\n", + __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. @@ -409,15 +410,20 @@ CoalesceEngine::processNextApplyEvent() } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( + uint32_t new_prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); - // TODO: Is this correct? 
- cacheBlocks[block_index].items[i].tempProp = cacheBlocks[block_index].items[i].prop; - if (cacheBlocks[block_index].items[i].prop != old_prop) { - if (peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i])) { + if (new_prop != old_prop) { + if (peerPushEngine->allocatePushSpace()) { + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", + __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", __func__, cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d5563cca7c..8cfe3c72cc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -97,7 +97,7 @@ PushEngine::ReqPort::recvReqRetry() } } -bool +void PushEngine::recvWLItem(WorkListItem wl) { // If there are no outdoing edges, no need to generate and push @@ -105,14 +105,14 @@ PushEngine::recvWLItem(WorkListItem wl) if (wl.degree == 0) { DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", __func__, wl.to_string()); - return true; + return; } assert((pushReqQueueSize == 0) || - (pushReqQueue.size() <= pushReqQueueSize)); - if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { - return false; - } + (pushReqQueue.size() < pushReqQueueSize)); + panic_if(pushReqQueue.size() == pushReqQueueSize, "You should call this " + "method after checking if there is enough push space. 
Use " + "allocatePushSpace.\n"); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); @@ -125,7 +125,6 @@ PushEngine::recvWLItem(WorkListItem wl) (!memReqQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } - return true; } void diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ce24f862ba..ae465f6eb1 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -149,7 +149,8 @@ class PushEngine : public BaseMemEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool recvWLItem(WorkListItem wl); + bool allocatePushSpace() { return pushReqQueue.size() < pushReqQueueSize; } + void recvWLItem(WorkListItem wl); void registerCoalesceEngine(CoalesceEngine* coalesce_engine); From e16c0deadb328f6496d9f424a21cd3677a5ce542 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 22 May 2022 17:49:06 -0700 Subject: [PATCH 105/247] Fixing an incorrect assertion. 
--- configs/accl/sega.py | 23 +++++++++++++++++------ src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 1 - 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8a6ac783c3..11e2cfb6af 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=64, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, @@ -109,7 +109,12 @@ def setEdgePort(self, port, i): self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self, num_mpus, vertex_cache_line_size, graph_path): + def __init__(self, + num_mpus, + vertex_cache_line_size, + graph_path, + first_addr, + first_value): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -122,7 +127,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0, + self.ctrl = CenteralController(addr=first_addr, value=first_value, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports @@ -147,13 +152,19 @@ def get_inputs(): argparser.add_argument("num_mpus", type=int) argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("graph_path", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) args = argparser.parse_args() - return args.num_mpus, args.vertex_cache_line_size, args.graph_path + return args.num_mpus, args.vertex_cache_line_size, \ + args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, graph_path = get_inputs() + num_mpus, vertex_cache_line_size, \ + graph_path, 
first_addr, first_value = get_inputs() + print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, graph_path) + system = SEGA(num_mpus, vertex_cache_line_size, \ + graph_path, first_addr, first_value) root = Root(full_system = False, system = system) m5.instantiate() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 39144972df..dd651f9e5a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -241,7 +241,7 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::respondToMemAlarm() { - assert(pendingMemAlarm() && (!nextEvictEvent.scheduled())); + assert(!nextEvictEvent.scheduled()); schedule(nextEvictEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8cfe3c72cc..ed23fb4d4b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -176,7 +176,6 @@ PushEngine::respondToMemAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); - DPRINTF(MPU, "%s: Responded to an alarm.\n", __func__); } bool From 83af4b3b2720bdb7d0ab3b836c4f0c2516b1a950 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 3 Jun 2022 07:44:25 -0700 Subject: [PATCH 106/247] Converting apply and evict queues to FIFOSet. 
--- src/accl/graph/base/data_structs.hh | 50 +++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 68 +++++++++----------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.hh | 3 +- 4 files changed, 76 insertions(+), 49 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 7535d4bbac..e03686a7e9 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,6 +32,9 @@ #include "base/cprintf.hh" #include "base/intmath.hh" +#include +#include + namespace gem5 { @@ -83,6 +86,53 @@ struct __attribute__ ((packed)) Edge }; static_assert(isPowerOf2(sizeof(WorkListItem))); +static_assert(isPowerOf2(sizeof(Edge))); + +template +class FIFOSet +{ + private: + std::queue fifo; + std::unordered_set set; + + public: + FIFOSet(int cap) + { + set.reserve(cap); + } + + void push_back(T item) + { + if (set.find(item) == set.end()) { + set.insert(item); + fifo.push(item); + } + } + + void pop_front() + { + T front = fifo.front(); + set.erase(front); + fifo.pop(); + } + + T& front() + { + return fifo.front(); + } + + size_t size() { + return fifo.size(); + } + + bool empty() { + return fifo.empty(); + } + + bool find(T item) { + return (set.find(item) != set.end()); + } +}; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dd651f9e5a..f96adbf8d8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,6 +45,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), pendingPushAlarm(false), + applyQueue(numLines), + evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -55,6 +57,7 @@ CoalesceEngine::CoalesceEngine(const 
CoalesceEngineParams ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + peerPushEngine->registerCoalesceEngine(this); } @@ -141,14 +144,18 @@ CoalesceEngine::recvReadAddr(Addr addr) "line[%d]", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; + if (!cacheBlocks[block_index].busyMask) { + applyQueue.push_back(block_index); + assert(!applyQueue.empty()); + if ((!nextApplyEvent.scheduled()) && + (!pendingPushAlarm)) { + schedule(nextApplyEvent, nextCycle()); + } + } return true; } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - //TODO: Fix this to work with new inheritance. - // assert( - // outstandingMemReqQueue.size() <= - // outstandingMemReqQueueSize); DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); @@ -278,8 +285,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - + Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " @@ -333,7 +339,7 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
- Addr aligned_addr = std::floor(addr / peerMemoryAtomSize) * peerMemoryAtomSize; + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); @@ -359,18 +365,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) " It does not have any taken items anymore.\n", __func__, block_index); // TODO: Fix this hack - bool found = false; - for (auto i : applyQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { - applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); - } + applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); } if ((!applyQueue.empty()) && @@ -395,15 +392,9 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " "needed. Adding the cache line to evict schedule.\n", __func__, block_index); - bool found = false; - for (auto i : evictQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { + if (cacheBlocks[block_index].hasConflict) { evictQueue.push_back(block_index); + assert(!evictQueue.empty()); DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } @@ -435,15 +426,7 @@ CoalesceEngine::processNextApplyEvent() } } // TODO: This is where eviction policy goes - // TODO: Fix this hack. - bool found = false; - for (auto i : evictQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { + if (cacheBlocks[block_index].hasConflict){ evictQueue.push_back(block_index); DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); @@ -469,15 +452,8 @@ CoalesceEngine::processNextEvictEvent() { int block_index = evictQueue.front(); - bool found_in_apply_queue = false; - for (auto i : applyQueue) { - if (i == block_index) { - found_in_apply_queue = true; - break; - } - } if ((cacheBlocks[block_index].busyMask) || - (found_in_apply_queue)) { + (applyQueue.find(block_index))) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); @@ -514,8 +490,8 @@ CoalesceEngine::processNextEvictEvent() " is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * - peerMemoryAtomSize; + roundDown(miss_addr, peerMemoryAtomSize); + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 824faef10d..177bb067ab 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -84,9 +84,9 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; bool pendingPushAlarm; - std::deque applyQueue; + FIFOSet applyQueue; - std::deque evictQueue; + FIFOSet evictQueue; EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ae465f6eb1..c93b3b386d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -31,6 +31,7 @@ #include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "base/intmath.hh" #include "params/PushEngine.hh" namespace gem5 @@ -59,7 +60,7 @@ class PushEngine : public BaseMemEngine std::tuple nextReadPacketInfo() { panic_if(done(), "Should not call nextPacketInfo when done.\n"); - Addr 
aligned_addr = std::floor(_start / _atom) * _atom; + Addr aligned_addr = roundDown(_start, _atom); Addr offset = _start - aligned_addr; int num_items = 0; From e9c4b2e982425c29d348780c5d819a8b7893f377 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 13 Jun 2022 14:48:49 -0700 Subject: [PATCH 107/247] Moving delete pkt in push_engine.cc. --- src/accl/graph/sega/push_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ed23fb4d4b..cb71b73c60 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -228,8 +228,8 @@ PushEngine::processNextPushEvent() reqOffsetMap.erase(pkt->req); reqNumEdgeMap.erase(pkt->req); reqValueMap.erase(pkt->req); - delete pkt; memRespQueue.pop_front(); + delete pkt; } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { From a07fba27ea6d0869853fe4db500680e4c62aeb9f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 19 Jun 2022 14:29:57 -0700 Subject: [PATCH 108/247] Enforced limited length on memRespQueue in PushEngine. 
--- configs/accl/sega.py | 15 +++++--- src/accl/graph/SConscript | 3 +- src/accl/graph/base/BaseMemEngine.py | 2 ++ src/accl/graph/base/base_mem_engine.cc | 49 +++++++++++++++++--------- src/accl/graph/base/base_mem_engine.hh | 4 +++ src/accl/graph/sega/coalesce_engine.cc | 5 ++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 5 +++ src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 12 +++++-- 10 files changed, 72 insertions(+), 25 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 11e2cfb6af..a5dd759f1f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,16 +9,21 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=64, - attached_memory_atom_size=64) + push_req_queue_size=1, + attached_memory_atom_size=64, + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=16) + num_mshr_entry=1, + num_tgts_per_mshr=1, + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=32, - on_the_fly_update_map_size=8) + update_queue_size=1, + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 00fa2466dd..9663d3f263 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,4 +27,5 @@ Import('*') -DebugFlag('MPU') \ No newline at end of file +DebugFlag('MPU') +DebugFlag('SEGAQSize') diff --git a/src/accl/graph/base/BaseMemEngine.py b/src/accl/graph/base/BaseMemEngine.py index 69f68e9dfc..2ecb6659d8 100644 --- a/src/accl/graph/base/BaseMemEngine.py +++ b/src/accl/graph/base/BaseMemEngine.py @@ -43,3 +43,5 @@ class 
BaseMemEngine(ClockedObject): attached_memory_atom_size = Param.Int(64, "The atom size of the attached " "memory.") + + resp_queue_size = Param.Int(64, "blah") diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index f02f1d2feb..112b0d63cb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,6 +29,8 @@ #include "accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" +#include "debug/SEGAQSize.hh" + namespace gem5 { @@ -37,6 +39,8 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): system(params.system), memPort(name() + ".mem_port", this), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + onTheFlyReqs(0), + respQueueSize(params.resp_queue_size), memAlarmRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), @@ -73,7 +77,7 @@ bool BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) { //TODO: Investigate sending true all the time - return owner->handleMemResp(pkt); + return owner->recvTimingResp(pkt); } void @@ -98,20 +102,25 @@ BaseMemEngine::processNextMemReqEvent() return; } - // TODO: Maybe add a DPRINTF here. - PacketPtr pkt = outstandingMemReqQueue.front(); - memPort.sendPacket(pkt); - DPRINTF(MPU, "%s: Sent a packet to memory with the following info. " - "pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - outstandingMemReqQueue.pop_front(); - - if (memAlarmRequested && - (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - memSpaceRequested))) { - memAlarmRequested = false; - memSpaceRequested = 0; - respondToMemAlarm(); + if ((respBuffSize() == -1) || + ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { + PacketPtr pkt = outstandingMemReqQueue.front(); + memPort.sendPacket(pkt); + onTheFlyReqs++; + DPRINTF(MPU, "%s: Sent a packet to memory with the following info. 
" + "pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); + outstandingMemReqQueue.pop_front(); + DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", + __func__, outstandingMemReqQueue.size()); + + if (memAlarmRequested && + (outstandingMemReqQueue.size() <= + (outstandingMemReqQueueSize - memSpaceRequested))) { + memAlarmRequested = false; + memSpaceRequested = 0; + respondToMemAlarm(); + } } if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { @@ -171,7 +180,8 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - + DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", + __func__, outstandingMemReqQueue.size()); assert(!outstandingMemReqQueue.empty()); if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); @@ -197,4 +207,11 @@ BaseMemEngine::wakeUp() } } +bool +BaseMemEngine::recvTimingResp(PacketPtr pkt) +{ + onTheFlyReqs--; + return handleMemResp(pkt); +} + } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 8a18807e2e..fc67f3f6d8 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -69,6 +69,8 @@ class BaseMemEngine : public ClockedObject MemPort memPort; int outstandingMemReqQueueSize; + int onTheFlyReqs; + int respQueueSize; bool memAlarmRequested; int memSpaceRequested; std::deque outstandingMemReqQueue; @@ -90,6 +92,7 @@ class BaseMemEngine : public ClockedObject void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void enqueueMemReq(PacketPtr pkt); + virtual int respBuffSize() = 0; virtual void respondToMemAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; @@ -109,6 +112,7 @@ class BaseMemEngine : public ClockedObject AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } + bool recvTimingResp(PacketPtr pkt); void 
recvFunctional(PacketPtr pkt); void wakeUp(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f96adbf8d8..ee1e3f85ff 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -141,11 +141,14 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " - "line[%d]", __func__, addr, block_index); + "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; if (!cacheBlocks[block_index].busyMask) { applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. " + "applyQueue.size = %u.\n", __func__, + block_index, applyQueue.size()); assert(!applyQueue.empty()); if ((!nextApplyEvent.scheduled()) && (!pendingPushAlarm)) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 177bb067ab..1e353c11b8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,6 +118,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; protected: + virtual int respBuffSize() { return -1; } virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cb71b73c60..a045bbdead 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -93,6 +93,11 @@ PushEngine::ReqPort::recvReqRetry() sendPacket(blockedPacket); if (!_blocked) { + DPRINTF(MPU, "%s: Sent the blockedPacket. " + "_blocked: %s, (blockedPacket == nullptr): %s.\n", + __func__, _blocked ? "true" : "false", + (blockedPacket == nullptr) ? 
"true" : "false"); + blockedPacket = nullptr; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c93b3b386d..2c17501d5b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -140,6 +140,7 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: + virtual int respBuffSize() { return memRespQueue.size(); } virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 75ac4f784e..55a9147ac9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -136,6 +136,9 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " + "onTheFlyUpdateMap.size: %lu.\n", + __func__, onTheFlyUpdateMap.size()); if (coalesceEngine->recvReadAddr(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " @@ -147,6 +150,10 @@ WLEngine::processNextReadEvent() __func__, updateQueue.size()); respPort.checkRetryReq(); } + } else { + DPRINTF(MPU, "%s: No entries available in onTheFlyUpdateMap. " + "onTheFlyUpdateMap.size: %lu.\n", __func__, + onTheFlyUpdateMap.size()); } } else { // TODO: Generalize this to reduce function rather than just min @@ -209,8 +216,9 @@ WLEngine::processNextReduceEvent() coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); onTheFlyUpdateMap.erase(addr); - DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, addr); + DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap. 
" + "onTheFlyUpdateMap.size: %lu.\n", + __func__, addr, onTheFlyUpdateMap.size()); } addrWorkListMap.clear(); } From dd056de8c00f33db13d14350910c5de8d6908c19 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 8 Jul 2022 10:36:46 -0700 Subject: [PATCH 109/247] Adding bit vector implementation for caching push meta data. --- configs/accl/sega.py | 7 +- src/accl/graph/base/base_mem_engine.cc | 10 +- src/accl/graph/base/data_structs.hh | 86 +++++++++------- src/accl/graph/sega/CoalesceEngine.py | 3 + src/accl/graph/sega/coalesce_engine.cc | 137 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 14 ++- src/accl/graph/sega/push_engine.cc | 62 ++++++++--- src/accl/graph/sega/push_engine.hh | 12 ++- 8 files changed, 227 insertions(+), 104 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a5dd759f1f..96408aa185 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=1, + push_req_queue_size=0, attached_memory_atom_size=64, outstanding_mem_req_queue_size=1, resp_queue_size=1) @@ -19,8 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=2) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) @@ -77,7 +76,7 @@ def __init__(self, ) edge_mem_ctrl.append( SimpleMemory(range=self._edge_ranges[i], - bandwidth="19.2GB/s", + bandwidth="4.8GB/s", latency="30ns", image_file=f"{graph_path}/edgelist_{i}") ) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 112b0d63cb..3086b81fc2 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,7 +29,6 @@ #include 
"accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" -#include "debug/SEGAQSize.hh" namespace gem5 { @@ -102,8 +101,8 @@ BaseMemEngine::processNextMemReqEvent() return; } - if ((respBuffSize() == -1) || - ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { + if (((respBuffSize() + onTheFlyReqs) < respQueueSize) || + (respQueueSize == 0)) { PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; @@ -111,8 +110,6 @@ BaseMemEngine::processNextMemReqEvent() "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", - __func__, outstandingMemReqQueue.size()); if (memAlarmRequested && (outstandingMemReqQueue.size() <= @@ -180,8 +177,7 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", - __func__, outstandingMemReqQueue.size()); + assert(!outstandingMemReqQueue.empty()); if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index e03686a7e9..e30d6029cb 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,8 +32,9 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include +#include #include +#include namespace gem5 { @@ -91,49 +92,64 @@ static_assert(isPowerOf2(sizeof(Edge))); template class FIFOSet { - private: - std::queue fifo; - std::unordered_set set; - - public: - FIFOSet(int cap) - { - set.reserve(cap); - } + private: + std::queue fifo; + std::unordered_set set; - void push_back(T item) - { - if (set.find(item) == set.end()) { - set.insert(item); - fifo.push(item); - } - } + public: + FIFOSet(int cap) + { + set.reserve(cap); + } - void pop_front() - { - T front = fifo.front(); - 
set.erase(front); - fifo.pop(); + void push_back(T item) + { + if (set.find(item) == set.end()) { + set.insert(item); + fifo.push(item); } + } - T& front() - { - return fifo.front(); - } + void pop_front() + { + T front = fifo.front(); + set.erase(front); + fifo.pop(); + } - size_t size() { - return fifo.size(); - } + T& front() + { + return fifo.front(); + } - bool empty() { - return fifo.empty(); - } + size_t size() { + return fifo.size(); + } - bool find(T item) { - return (set.find(item) != set.end()); - } + bool empty() { + return fifo.empty(); + } + + bool find(T item) { + return (set.find(item) != set.end()); + } }; +// template +// class BitVector +// { +// private: +// int it; +// std::bitset bitStore; + +// public: +// BitVector(): it(0) { bitStore.reset(); } + +// uint32_t next() { + +// } +// }; + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 086f284950..7667a22c5a 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -41,4 +41,7 @@ class CoalesceEngine(BaseMemEngine): num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") + # Don't change. If changed. It will break functionality of coalesce. 
+ resp_queue_size = 0 + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ee1e3f85ff..b5eeae694e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -44,7 +44,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - pendingPushAlarm(false), applyQueue(numLines), evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), @@ -58,7 +57,9 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } - peerPushEngine->registerCoalesceEngine(this); + peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); + + needsApply.reset(); } void @@ -67,6 +68,38 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) sendMemFunctional(pkt); } +void +CoalesceEngine::startup() +{ + AddrRangeList vertex_ranges = getAddrRanges(); + + bool found = false; + Addr first_match_addr = 0; + while(!found) { + for (auto range: vertex_ranges) { + if (range.contains(first_match_addr)) { + found = true; + break; + } + } + first_match_addr += peerMemoryAtomSize; + } + + found = false; + Addr second_match_addr = first_match_addr + peerMemoryAtomSize; + while(!found) { + for (auto range: vertex_ranges) { + if (range.contains(second_match_addr)) { + found = true; + break; + } + } + second_match_addr += peerMemoryAtomSize; + } + + nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); +} + void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) { @@ -150,8 +183,7 @@ CoalesceEngine::recvReadAddr(Addr addr) "applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); assert(!applyQueue.empty()); - if ((!nextApplyEvent.scheduled()) && - (!pendingPushAlarm)) { + if ((!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); 
} } @@ -363,18 +395,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) { + if ((cacheBlocks[block_index].busyMask == 0)) {(aligned_addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); - // TODO: Fix this hack applyQueue.push_back(block_index); DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } if ((!applyQueue.empty()) && - (!pendingPushAlarm) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } @@ -393,14 +423,7 @@ CoalesceEngine::processNextApplyEvent() stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " - "needed. Adding the cache line to evict schedule.\n", - __func__, block_index); - if (cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); - assert(!evictQueue.empty()); - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); - } + "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; @@ -409,31 +432,38 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].items[i].tempProp); if (new_prop != old_prop) { - if (peerPushEngine->allocatePushSpace()) { - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", - __func__, + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", - __func__, - cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); + + Addr block_addr = cacheBlocks[block_index].addr; + int atom_index = (int) (block_addr / (peerMemoryAtomSize * nmpu)); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + int bit_index = atom_index * block_bits + i; + + if (needsApply[bit_index] == 1) { + DPRINTF(MPU, "%s: WorkListItem[%lu] already set in bit-vector." + " Not doing anything further.\n", __func__, + block_addr + (i * sizeof(WorkListItem))); } else { - peerPushEngine->setPushAlarm(); - pendingPushAlarm = true; - return; + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); + } else { + needsApply[bit_index] = 1; + } } } } - // TODO: This is where eviction policy goes - if (cacheBlocks[block_index].hasConflict){ - evictQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); - } + } + + // TODO: This is where eviction policy goes + if (cacheBlocks[block_index].hasConflict){ + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); } applyQueue.pop_front(); @@ -536,9 +566,42 @@ CoalesceEngine::processNextEvictEvent() void CoalesceEngine::respondToPushAlarm() { - assert(pendingPushAlarm && (!nextApplyEvent.scheduled())); - pendingPushAlarm = false; - schedule(nextApplyEvent, nextCycle()); + DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); + int it; + for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + uint32_t slice = 0; + for (int i = 0; i < numElementsPerLine; i++) { + slice <<= 1; + slice |= needsApply[it + i]; + } + if (slice) { + break; + } + } + DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", + __func__, slice, it); + + Addr block_addr = (nmpu * peerMemoryAtomSize) * + ((int)(it / (peerMemoryAtomSize / sizeof(WorkListItem)))); + int block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + + if ((cacheBlocks[block_index].addr == block_addr) && + (cacheBlocks[block_index].valid)) { + // hit in cache + bool do_push = cacheBlocks[block_index].busyMask == 0 ? true : false; + for (int i = 0; i < numElementsPerLine; i++) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], do_push); + } + + // TODO: Should we add block_index to evict_queue? 
+ if (do_push && cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + } + } else { + PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); + + } } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 1e353c11b8..e6c70502af 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,12 +29,16 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#include + #include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" +#define MAX_BITVECTOR_SIZE (1 << 30) + // TODO: Add parameters for size, memory atom size, type size, // length of items in the blocks. namespace gem5 @@ -68,6 +72,7 @@ class CoalesceEngine : public BaseMemEngine items = new WorkListItem [num_elements]; } }; + int nmpu; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -83,8 +88,9 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; - bool pendingPushAlarm; FIFOSet applyQueue; + int needsApplyFirstPointer; + std::bitset needsApply; FIFOSet evictQueue; @@ -127,14 +133,16 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine(const CoalesceEngineParams ¶ms); - void recvFunctional(PacketPtr pkt); - bool recvReadAddr(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); void respondToPushAlarm(); + + void recvFunctional(PacketPtr pkt); + + virtual void startup(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a045bbdead..8bc2d55a28 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" 
+#include "debug/SEGAQSize.hh" #include "mem/packet_access.hh" namespace gem5 @@ -37,9 +38,10 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), - pushAlarmSet(false), + retrySpaceAllocated(0), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), + numRetries(0), pushReqQueueSize(params.push_req_queue_size), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), @@ -59,9 +61,11 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } void -PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine) +PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, + int elements_per_line) { peerCoalesceEngine = coalesce_engine; + numElementsPerLine = elements_per_line; } void @@ -115,15 +119,21 @@ PushEngine::recvWLItem(WorkListItem wl) assert((pushReqQueueSize == 0) || (pushReqQueue.size() < pushReqQueueSize)); - panic_if(pushReqQueue.size() == pushReqQueueSize, "You should call this " - "method after checking if there is enough push space. Use " - "allocatePushSpace.\n"); + panic_if((pushReqQueue.size() == pushReqQueueSize) && + (pushReqQueueSize != 0), "You should call this method after " + "checking if there is enough push space. 
Use allocatePushSpace.\n"); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + + if (curTick() % 50000 == 0) { + DPRINTF(SEGAQSize, "%s: pushReqQueue.size: %lu.\n", + __func__, pushReqQueue.size()); + } assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && @@ -132,6 +142,25 @@ PushEngine::recvWLItem(WorkListItem wl) } } +void +PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) +{ + if (do_push) { + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = wl.prop; + + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + numRetries--; + } + retrySpaceAllocated--; + if ((!nextAddrGenEvent.scheduled()) && + (!memReqQueueFull())) { + schedule(nextAddrGenEvent, nextCycle()); + } +} + void PushEngine::processNextAddrGenEvent() { @@ -158,8 +187,10 @@ PushEngine::processNextAddrGenEvent() DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - if (pushAlarmSet && (pushReqQueue.size() == pushReqQueueSize - 1)) { - pushAlarmSet = false; + if (numRetries > 0) { + retrySpaceAllocated++; + } + if ((retrySpaceAllocated % numElementsPerLine) == 0) { peerCoalesceEngine->respondToPushAlarm(); } } @@ -261,17 +292,20 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } -void -PushEngine::setPushAlarm() -{ - assert(!pushAlarmSet); - pushAlarmSet = true; +bool +PushEngine::allocatePushSpace() { + if ((pushReqQueueSize == 0) || + ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + return true; + } else { + numRetries++; + return false; + } } PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), - ADD_STAT(numUpdates, statistics::units::Count::get(), "Number of sent updates.") { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2c17501d5b..4f388cd7e6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -98,13 +98,15 @@ class PushEngine : public BaseMemEngine virtual void recvReqRetry(); }; - bool pushAlarmSet; + int numElementsPerLine; + int retrySpaceAllocated; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; Addr baseEdgeAddr; + int numRetries; int pushReqQueueSize; std::deque pushReqQueue; @@ -151,12 +153,14 @@ class PushEngine : public BaseMemEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool allocatePushSpace() { return pushReqQueue.size() < pushReqQueueSize; } + bool allocatePushSpace(); + void recvWLItem(WorkListItem wl); - void registerCoalesceEngine(CoalesceEngine* coalesce_engine); + void recvWLItemRetry(WorkListItem wl, bool do_push); - void setPushAlarm(); + void registerCoalesceEngine(CoalesceEngine* coalesce_engine, + int elements_per_line); }; } From 7a351854013b45cfe260990b60dbc160e1aac24a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: 
Sun, 17 Jul 2022 16:12:07 -0700 Subject: [PATCH 110/247] Completing retry between coalesce and push engine. --- configs/accl/sega.py | 4 +- src/accl/graph/SConscript | 1 + src/accl/graph/TODO.md | 7 +- src/accl/graph/base/base_mem_engine.cc | 13 ++- src/accl/graph/base/data_structs.hh | 3 +- src/accl/graph/sega/coalesce_engine.cc | 155 +++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 12 ++ src/accl/graph/sega/push_engine.cc | 11 +- 8 files changed, 157 insertions(+), 49 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 96408aa185..65645b3bb3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=0, + push_req_queue_size=4, attached_memory_atom_size=64, outstanding_mem_req_queue_size=1, resp_queue_size=1) @@ -19,7 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=2) + outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 9663d3f263..36e16affa3 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -29,3 +29,4 @@ Import('*') DebugFlag('MPU') DebugFlag('SEGAQSize') +DebugFlag('MahyarMath') diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index 29b5a2939e..ebfca7e794 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,5 +1,8 @@ # TODO Items -* Replace std::floor with roundDown from intmath.hh in src * We might need to revisit the fact that we could insert something to a queue on - the same cycle that another event is consuming something from the queue. 
\ No newline at end of file + the same cycle that another event is consuming something from the queue. +* Move checking for wl.degree == 0 to coalesce engine. +* Fix the retry system between memory queue and coalesce engine +* Update inheritance: There is not enough reason for PushEngine and +CoalesceEngine to be of the same type (i.e. delete BaseMemEngine). diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 3086b81fc2..64aaa3a737 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -159,17 +159,22 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) bool BaseMemEngine::allocateMemReqSpace(int space) { - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + assert((outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); return ( - outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space) + (outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space)) ); } bool BaseMemEngine::memReqQueueFull() { - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); + assert((outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + return ( + (outstandingMemReqQueueSize != 0) && + (outstandingMemReqQueue.size() == outstandingMemReqQueueSize)); } void diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index e30d6029cb..9c250c6a2f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -93,6 +93,7 @@ template class FIFOSet { private: + // int numInvalids; std::queue fifo; std::unordered_set set; @@ -127,7 +128,7 @@ class FIFOSet } bool empty() { - return fifo.empty(); + return (size() == 0); } bool find(T item) { diff --git 
a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b5eeae694e..1c3f2bcadf 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,6 +31,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" +#include "debug/MahyarMath.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -75,29 +76,39 @@ CoalesceEngine::startup() bool found = false; Addr first_match_addr = 0; - while(!found) { + while(true) { for (auto range: vertex_ranges) { if (range.contains(first_match_addr)) { found = true; break; } } + if (found) { + break; + } first_match_addr += peerMemoryAtomSize; } found = false; Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - while(!found) { + while(true) { for (auto range: vertex_ranges) { if (range.contains(second_match_addr)) { found = true; break; } } + if (found) { + break; + } second_match_addr += peerMemoryAtomSize; } nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); + memoryAddressOffset = first_match_addr; + DPRINTF(MahyarMath, "%s: Initialized address translation information." 
+ " nmpu: %d, memoryAddressOffset: %lu.\n", + __func__, nmpu, memoryAddressOffset); } void @@ -106,6 +117,40 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) peerWLEngine = wl_engine; } +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + return ((int) (addr / peerMemoryAtomSize)) % numLines; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBitIndexBase(Addr addr) +{ + DPRINTF(MahyarMath, "%s: Calculating BitIndexBase for addr %lu.\n", + __func__, addr); + int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + int bit_index = atom_index * block_bits; + DPRINTF(MahyarMath, "%s: BitIndexBase for addr %lu is %d.\n", + __func__, addr, bit_index); + return bit_index; +} + +// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) +Addr +CoalesceEngine::getBlockAddrFromBitIndex(int index) +{ + DPRINTF(MahyarMath, "%s: Calculating BlockAddr for index %d.\n", + __func__, index); + Addr block_addr = (nmpu * peerMemoryAtomSize) * + ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); + DPRINTF(MahyarMath, "%s: BlockAddr for index %d is %lu.\n", + __func__, index, (block_addr + memoryAddressOffset)); + return (block_addr + memoryAddressOffset); +} + bool CoalesceEngine::recvReadAddr(Addr addr) { @@ -298,6 +343,31 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } + if (pkt->findNextSenderState()) { + Addr addr = pkt->getAddr(); + int it = getBitIndexBase(addr); + int block_index = getBlockIndex(addr); + bool found_in_cache = (cacheBlocks[block_index].addr == addr); + + // We have to send the items regardless of them being found in the + // cache. However, if they are found in the cache, two things should + // happen. First, do_push should be set to false and the bit vector + // value for the items should not change. 
To future Mahyar and Marjan, + // If this is confusing, please look at where each item is pushed to + // the apply queue. Hint: Think about updates that might not be sent + // out if you reset the bit regardless of the line being found in the + // cache. + WorkListItem* items = pkt->getPtr(); + for (int i = 0; i < numElementsPerLine; i++) { + needsApply[it + i] = + (needsApply[it + i] == 1) && found_in_cache ? 1 : 0; + + peerPushEngine->recvWLItemRetry(items[i], + ((!found_in_cache) && needsApply[it + i])); + } + return true; + } + Addr addr = pkt->getAddr(); int block_index = (addr / peerMemoryAtomSize) % numLines; @@ -395,11 +465,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) {(aligned_addr / peerMemoryAtomSize) % numLines; + if ((cacheBlocks[block_index].busyMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); + int bit_index = getBitIndexBase(cacheBlocks[block_index].addr); + for (int i = 0; i < numElementsPerLine; i++) { + needsApply[bit_index + i] = 0; + } DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -438,22 +512,15 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - Addr block_addr = cacheBlocks[block_index].addr; - int atom_index = (int) (block_addr / (peerMemoryAtomSize * nmpu)); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - int bit_index = atom_index * block_bits + i; + int bit_index = + getBitIndexBase(cacheBlocks[block_index].addr) + i; - if (needsApply[bit_index] == 1) { - DPRINTF(MPU, "%s: WorkListItem[%lu] already set in bit-vector." 
- " Not doing anything further.\n", __func__, - block_addr + (i * sizeof(WorkListItem))); + assert(needsApply[bit_index] == 0); + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); } else { - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - } else { - needsApply[bit_index] = 1; - } + needsApply[bit_index] = 1; } } } @@ -567,40 +634,56 @@ void CoalesceEngine::respondToPushAlarm() { DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); - int it; + Addr block_addr = 0; + int block_index = 0; + int it = 0; + uint32_t slice = 0; + bool hit_in_cache = false; for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - uint32_t slice = 0; for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsApply[it + i]; } if (slice) { - break; + block_addr = getBlockAddrFromBitIndex(it); + block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + if ((cacheBlocks[block_index].addr == block_addr) && + (cacheBlocks[block_index].valid)) { + if (cacheBlocks[block_index].busyMask == 0) { + hit_in_cache = true; + break; + } + } else { + hit_in_cache = false; + break; + } } } + + assert(it < MAX_BITVECTOR_SIZE); + DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", __func__, slice, it); - Addr block_addr = (nmpu * peerMemoryAtomSize) * - ((int)(it / (peerMemoryAtomSize / sizeof(WorkListItem)))); - int block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; - - if ((cacheBlocks[block_index].addr == block_addr) && - (cacheBlocks[block_index].valid)) { - // hit in cache - bool do_push = cacheBlocks[block_index].busyMask == 0 ? true : false; + if (hit_in_cache) { for (int i = 0; i < numElementsPerLine; i++) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], do_push); - } - - // TODO: Should we add block_index to evict_queue? 
- if (do_push && cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); + peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], + (needsApply[it + i] == 1)); + needsApply[it + i] = 0; } } else { + // FIXME: Fix the retry mechanism between memory and cache to + // handle memory retries correctly. This probably requires scheduling + // an event for sending the retry. For now we're enabling infinite + // queueing in the outstandingMemReqQueue. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); - + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + if (allocateMemReqSpace(1)) { + enqueueMemReq(pkt); + } else { + requestMemAlarm(1); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e6c70502af..973ea479c1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -72,7 +72,15 @@ class CoalesceEngine : public BaseMemEngine items = new WorkListItem [num_elements]; } }; + + struct SenderState : public Packet::SenderState + { + bool isRetry; + SenderState(bool is_retry): isRetry(is_retry) {} + }; + int nmpu; + Addr memoryAddressOffset; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -94,6 +102,10 @@ class CoalesceEngine : public BaseMemEngine FIFOSet evictQueue; + int getBlockIndex(Addr addr); + int getBitIndexBase(Addr addr); + Addr getBlockAddrFromBitIndex(int index); + EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8bc2d55a28..fa611392b4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -149,9 +149,13 @@ PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - - 
pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + if (wl.degree != 0) { + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + } else { + DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", + __func__, wl.to_string()); + } numRetries--; } retrySpaceAllocated--; @@ -164,7 +168,6 @@ PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) void PushEngine::processNextAddrGenEvent() { - Addr aligned_addr, offset; int num_edges; From 2b9604dc53c675f1e4fc943c162e43929ff0af27 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 19 Jul 2022 07:33:16 -0700 Subject: [PATCH 111/247] Updating variable names and debug flags. --- src/accl/graph/SConscript | 3 +- src/accl/graph/base/base_mem_engine.cc | 20 ++--- src/accl/graph/base/base_mem_engine.hh | 12 +-- src/accl/graph/base/data_structs.hh | 33 +++----- src/accl/graph/sega/SConscript | 3 + src/accl/graph/sega/coalesce_engine.cc | 100 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 9 +-- src/accl/graph/sega/push_engine.cc | 53 ++++++------- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 2 +- 10 files changed, 113 insertions(+), 124 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 36e16affa3..7ca60c30bd 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,5 +28,4 @@ Import('*') DebugFlag('MPU') -DebugFlag('SEGAQSize') -DebugFlag('MahyarMath') +# CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine']) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 64aaa3a737..32c314033d 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -40,7 +40,7 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), 
respQueueSize(params.resp_queue_size), - memAlarmRequested(false), + memRetryRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)), @@ -111,12 +111,12 @@ BaseMemEngine::processNextMemReqEvent() __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - if (memAlarmRequested && + if (memRetryRequested && (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - memSpaceRequested))) { - memAlarmRequested = false; + memRetryRequested = false; memSpaceRequested = 0; - respondToMemAlarm(); + recvMemRetry(); } } @@ -157,7 +157,7 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseMemEngine::allocateMemReqSpace(int space) +BaseMemEngine::allocateMemQueueSpace(int space) { assert((outstandingMemReqQueueSize == 0) || (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); @@ -168,7 +168,7 @@ BaseMemEngine::allocateMemReqSpace(int space) } bool -BaseMemEngine::memReqQueueFull() +BaseMemEngine::memQueueFull() { assert((outstandingMemReqQueueSize == 0) || (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); @@ -180,7 +180,7 @@ BaseMemEngine::memReqQueueFull() void BaseMemEngine::enqueueMemReq(PacketPtr pkt) { - panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); + panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); assert(!outstandingMemReqQueue.empty()); @@ -190,12 +190,12 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) } void -BaseMemEngine::requestMemAlarm(int space) { - panic_if((memAlarmRequested == true) || (memSpaceRequested != 0), +BaseMemEngine::requestMemRetry(int space) { + panic_if((memRetryRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); - memAlarmRequested = true; 
+ memRetryRequested = true; memSpaceRequested = space; } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index fc67f3f6d8..64ef49ee1d 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -71,7 +71,7 @@ class BaseMemEngine : public ClockedObject int outstandingMemReqQueueSize; int onTheFlyReqs; int respQueueSize; - bool memAlarmRequested; + bool memRetryRequested; int memSpaceRequested; std::deque outstandingMemReqQueue; @@ -83,17 +83,17 @@ class BaseMemEngine : public ClockedObject size_t peerMemoryAtomSize; - bool allocateMemReqSpace(int space); - bool memReqQueueFull(); + bool allocateMemQueueSpace(int space); + bool memQueueFull(); - bool pendingMemAlarm() { return memAlarmRequested; } - void requestMemAlarm(int space); + bool pendingMemRetry() { return memRetryRequested; } + void requestMemRetry(int space); void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void enqueueMemReq(PacketPtr pkt); virtual int respBuffSize() = 0; - virtual void respondToMemAlarm() = 0; + virtual void recvMemRetry() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 9c250c6a2f..f938be72f1 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -93,8 +93,6 @@ template class FIFOSet { private: - // int numInvalids; - std::queue fifo; std::unordered_set set; public: @@ -107,24 +105,22 @@ class FIFOSet { if (set.find(item) == set.end()) { set.insert(item); - fifo.push(item); } } void pop_front() { - T front = fifo.front(); - set.erase(front); - fifo.pop(); + assert(set.begin() != set.end()); + set.erase(set.begin()); } - T& front() + T front() { - return fifo.front(); + return *(set.begin()); } size_t size() { - return fifo.size(); + return set.size(); } bool empty() { @@ -134,22 +130,11 @@ 
class FIFOSet bool find(T item) { return (set.find(item) != set.end()); } -}; - -// template -// class BitVector -// { -// private: -// int it; -// std::bitset bitStore; - -// public: -// BitVector(): it(0) { bitStore.reset(); } -// uint32_t next() { - -// } -// }; + void erase(T item) { + set.erase(item); + } +}; } diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 16fab86ede..77e508f4ed 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -39,3 +39,6 @@ Source('wl_engine.cc') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') +DebugFlag('CoalesceEngine') +DebugFlag('PushEngine') +DebugFlag('WLEngine') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1c3f2bcadf..66b8e1fad7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,7 +31,6 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/MahyarMath.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -60,7 +59,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); - needsApply.reset(); + needsPush.reset(); } void @@ -106,9 +105,6 @@ CoalesceEngine::startup() nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); memoryAddressOffset = first_match_addr; - DPRINTF(MahyarMath, "%s: Initialized address translation information." 
- " nmpu: %d, memoryAddressOffset: %lu.\n", - __func__, nmpu, memoryAddressOffset); } void @@ -128,13 +124,9 @@ CoalesceEngine::getBlockIndex(Addr addr) int CoalesceEngine::getBitIndexBase(Addr addr) { - DPRINTF(MahyarMath, "%s: Calculating BitIndexBase for addr %lu.\n", - __func__, addr); int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); int bit_index = atom_index * block_bits; - DPRINTF(MahyarMath, "%s: BitIndexBase for addr %lu is %d.\n", - __func__, addr, bit_index); return bit_index; } @@ -142,17 +134,13 @@ CoalesceEngine::getBitIndexBase(Addr addr) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { - DPRINTF(MahyarMath, "%s: Calculating BlockAddr for index %d.\n", - __func__, index); Addr block_addr = (nmpu * peerMemoryAtomSize) * ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); - DPRINTF(MahyarMath, "%s: BlockAddr for index %d is %lu.\n", - __func__, index, (block_addr + memoryAddressOffset)); return (block_addr + memoryAddressOffset); } bool -CoalesceEngine::recvReadAddr(Addr addr) +CoalesceEngine::recvWLRead(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", @@ -239,7 +227,7 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (memReqQueueFull()) { + if (memQueueFull()) { DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. 
" "Rejecting request.\n", __func__); stats.readRejections++; @@ -326,7 +314,7 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::respondToMemAlarm() +CoalesceEngine::recvMemRetry() { assert(!nextEvictEvent.scheduled()); schedule(nextEvictEvent, nextCycle()); @@ -347,8 +335,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int it = getBitIndexBase(addr); int block_index = getBlockIndex(addr); - bool found_in_cache = (cacheBlocks[block_index].addr == addr); + bool line_do_push = false; + if (cacheBlocks[block_index].addr == addr) { + if (cacheBlocks[block_index].busyMask == 0) { + assert(applyQueue.find(block_index)); + line_do_push = true; + } else { + line_do_push = false; + } + } // We have to send the items regardless of them being found in the // cache. However, if they are found in the cache, two things should // happen. First, do_push should be set to false and the bit vector @@ -359,11 +355,19 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // cache. WorkListItem* items = pkt->getPtr(); for (int i = 0; i < numElementsPerLine; i++) { - needsApply[it + i] = - (needsApply[it + i] == 1) && found_in_cache ? 
1 : 0; - + assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; peerPushEngine->recvWLItemRetry(items[i], - ((!found_in_cache) && needsApply[it + i])); + (line_do_push && needsPush[it + i])); + } + + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); } return true; } @@ -470,10 +474,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); - int bit_index = getBitIndexBase(cacheBlocks[block_index].addr); - for (int i = 0; i < numElementsPerLine; i++) { - needsApply[bit_index + i] = 0; - } DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -488,6 +488,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { + if (applyQueue.empty()) { + return; + } + int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { @@ -514,13 +518,13 @@ CoalesceEngine::processNextApplyEvent() int bit_index = getBitIndexBase(cacheBlocks[block_index].addr) + i; - - assert(needsApply[bit_index] == 0); - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - } else { - needsApply[bit_index] = 1; + if (cacheBlocks[block_index].items[i].degree != 0) { + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); + } else { + needsPush[bit_index] = 1; + } } } } @@ -536,7 +540,7 @@ CoalesceEngine::processNextApplyEvent() applyQueue.pop_front(); if ((!evictQueue.empty()) && - (!pendingMemAlarm()) && + (!pendingMemRetry()) && (!nextEvictEvent.scheduled())) { 
schedule(nextEvictEvent, nextCycle()); } @@ -562,13 +566,13 @@ CoalesceEngine::processNextEvictEvent() int space_needed = cacheBlocks[block_index].dirty ? (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!allocateMemReqSpace(space_needed)) { + if (!allocateMemQueueSpace(space_needed)) { DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " "procees the eviction of cache line [%d]. dirty: %d, " "hasConflict: %d.\n", __func__, block_index, cacheBlocks[block_index].dirty, cacheBlocks[block_index].hasConflict); - requestMemAlarm(space_needed); + requestMemRetry(space_needed); return; } else { if (cacheBlocks[block_index].dirty) { @@ -631,7 +635,7 @@ CoalesceEngine::processNextEvictEvent() } void -CoalesceEngine::respondToPushAlarm() +CoalesceEngine::recvPushRetry() { DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); Addr block_addr = 0; @@ -639,14 +643,15 @@ CoalesceEngine::respondToPushAlarm() int it = 0; uint32_t slice = 0; bool hit_in_cache = false; + for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; - slice |= needsApply[it + i]; + slice |= needsPush[it + i]; } if (slice) { block_addr = getBlockAddrFromBitIndex(it); - block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + block_index = getBlockIndex(block_addr); if ((cacheBlocks[block_index].addr == block_addr) && (cacheBlocks[block_index].valid)) { if (cacheBlocks[block_index].busyMask == 0) { @@ -662,14 +667,23 @@ CoalesceEngine::respondToPushAlarm() assert(it < MAX_BITVECTOR_SIZE); - DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", + DPRINTF(MPU, "%s: Found slice %u at %d position in needsPush.\n", __func__, slice, it); if (hit_in_cache) { for (int i = 0; i < numElementsPerLine; i++) { + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + 
cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], - (needsApply[it + i] == 1)); - needsApply[it + i] = 0; + (needsPush[it + i] == 1)); + needsPush[it + i] = 0; + } + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); } } else { // FIXME: Fix the retry mechanism between memory and cache to @@ -679,10 +693,10 @@ CoalesceEngine::respondToPushAlarm() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - if (allocateMemReqSpace(1)) { + if (allocateMemQueueSpace(1)) { enqueueMemReq(pkt); } else { - requestMemAlarm(1); + requestMemRetry(1); } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 973ea479c1..0fa555c84a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -97,8 +97,7 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; FIFOSet applyQueue; - int needsApplyFirstPointer; - std::bitset needsApply; + std::bitset needsPush; FIFOSet evictQueue; @@ -137,7 +136,7 @@ class CoalesceEngine : public BaseMemEngine protected: virtual int respBuffSize() { return -1; } - virtual void respondToMemAlarm(); + virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -145,12 +144,12 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine(const CoalesceEngineParams ¶ms); - bool recvReadAddr(Addr addr); + bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); - void respondToPushAlarm(); + void recvPushRetry(); void recvFunctional(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index fa611392b4..16e0ca6c6c 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,7 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" -#include "debug/SEGAQSize.hh" +#include "debug/PushEngine.hh" #include "mem/packet_access.hh" namespace gem5 @@ -109,13 +109,7 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::recvWLItem(WorkListItem wl) { - // If there are no outdoing edges, no need to generate and push - // updates. Therefore, we only need to return true. - if (wl.degree == 0) { - DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", - __func__, wl.to_string()); - return; - } + assert(wl.degree != 0); assert((pushReqQueueSize == 0) || (pushReqQueue.size() < pushReqQueueSize)); @@ -123,6 +117,7 @@ PushEngine::recvWLItem(WorkListItem wl) (pushReqQueueSize != 0), "You should call this method after " "checking if there is enough push space. Use allocatePushSpace.\n"); + DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; @@ -130,14 +125,9 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - if (curTick() % 50000 == 0) { - DPRINTF(SEGAQSize, "%s: pushReqQueue.size: %lu.\n", - __func__, pushReqQueue.size()); - } - assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && - (!memReqQueueFull())) { + (!memQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } } @@ -145,24 +135,22 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) { + DPRINTF(PushEngine, "%s: Received %s with do_push = %s.\n", + __func__, wl.to_string(), do_push ? 
"true" : "false"); if (do_push) { Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - if (wl.degree != 0) { - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); - } else { - DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", - __func__, wl.to_string()); - } + assert(wl.degree != 0); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); numRetries--; + if ((!nextAddrGenEvent.scheduled()) && + (!memQueueFull())) { + schedule(nextAddrGenEvent, nextCycle()); + } } retrySpaceAllocated--; - if ((!nextAddrGenEvent.scheduled()) && - (!memReqQueueFull())) { - schedule(nextAddrGenEvent, nextCycle()); - } } void @@ -173,7 +161,7 @@ PushEngine::processNextAddrGenEvent() PushPacketInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - DPRINTF(MPU, "%s: Current packet information generated by " + DPRINTF(PushEngine, "%s: Current packet information generated by " "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); @@ -185,22 +173,22 @@ PushEngine::processNextAddrGenEvent() enqueueMemReq(pkt); if (curr_info.done()) { - DPRINTF(MPU, "%s: Current PushPacketInfoGen is done.\n", __func__); + DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); pushReqQueue.pop_front(); - DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. " + DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); if (numRetries > 0) { retrySpaceAllocated++; } if ((retrySpaceAllocated % numElementsPerLine) == 0) { - peerCoalesceEngine->respondToPushAlarm(); + peerCoalesceEngine->recvPushRetry(); } } - if (memReqQueueFull()) { + if (memQueueFull()) { if (!pushReqQueue.empty()) { - requestMemAlarm(1); + requestMemRetry(1); } return; } @@ -211,9 +199,10 @@ PushEngine::processNextAddrGenEvent() } void -PushEngine::respondToMemAlarm() +PushEngine::recvMemRetry() { assert(!nextAddrGenEvent.scheduled()); + DPRINTF(PushEngine, "%s: Responding to a memory alarm.\n", __func__); schedule(nextAddrGenEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4f388cd7e6..11122067d6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -143,7 +143,7 @@ class PushEngine : public BaseMemEngine protected: virtual int respBuffSize() { return memRespQueue.size(); } - virtual void respondToMemAlarm(); + virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 55a9147ac9..27ba5c40c8 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -139,7 +139,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " "onTheFlyUpdateMap.size: %lu.\n", __func__, onTheFlyUpdateMap.size()); - if (coalesceEngine->recvReadAddr(update_addr)) { + if (coalesceEngine->recvWLRead(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, From 86a72bc496be523600caf672cdd24c14ba484603 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 19 Jul 2022 14:33:22 -0700 Subject: [PATCH 112/247] Somewhat fixing the correctness. 
--- src/accl/graph/sega/coalesce_engine.cc | 97 +++++++++++++++++--------- src/accl/graph/sega/push_engine.cc | 3 +- 2 files changed, 65 insertions(+), 35 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 66b8e1fad7..274d85a5b1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -117,6 +117,7 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) int CoalesceEngine::getBlockIndex(Addr addr) { + assert((addr % peerMemoryAtomSize) == 0); return ((int) (addr / peerMemoryAtomSize)) % numLines; } @@ -124,6 +125,7 @@ CoalesceEngine::getBlockIndex(Addr addr) int CoalesceEngine::getBitIndexBase(Addr addr) { + assert((addr % peerMemoryAtomSize) == 0); int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); int bit_index = atom_index * block_bits; @@ -134,6 +136,7 @@ CoalesceEngine::getBitIndexBase(Addr addr) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { + assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); Addr block_addr = (nmpu * peerMemoryAtomSize) * ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); return (block_addr + memoryAddressOffset); @@ -336,39 +339,62 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) int it = getBitIndexBase(addr); int block_index = getBlockIndex(addr); - bool line_do_push = false; - if (cacheBlocks[block_index].addr == addr) { + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + // We read the address to send the wl but it is put in cache before + // the read response arrives. if (cacheBlocks[block_index].busyMask == 0) { - assert(applyQueue.find(block_index)); - line_do_push = true; + // It is not busy anymore, we have to send the wl from cache. 
+ for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (cacheBlocks[block_index].items[i].degree == 0))); + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], needsPush[it + i]); + needsPush[it + i] = 0; + } + // Since we have just applied the line, we can take it out of + // the applyQueue if it's in there. No need to do the same + // thing for evictQueue. + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } + } } else { - line_do_push = false; + // The line is busy. Therefore, we have to disregard the data + // we received from the memory and also tell the push engine to + // deallocate the space it allocated for this retry. However, + // we still have to rememeber that these items need a retry. + // i.e. don't change needsPush, call recvWLItemRetry with + // do_push = false + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (cacheBlocks[block_index].items[i].degree == 0))); + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], false); + } + } + } else { + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. + WorkListItem* items = pkt->getPtr(); + // No applying of the line needed. + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (items[i].degree == 0))); + peerPushEngine->recvWLItemRetry(items[i], needsPush[it + i]); + needsPush[it + i] = 0; } - } - // We have to send the items regardless of them being found in the - // cache. 
However, if they are found in the cache, two things should - // happen. First, do_push should be set to false and the bit vector - // value for the items should not change. To future Mahyar and Marjan, - // If this is confusing, please look at where each item is pushed to - // the apply queue. Hint: Think about updates that might not be sent - // out if you reset the bit regardless of the line being found in the - // cache. - WorkListItem* items = pkt->getPtr(); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); - // TODO: Make this more programmable - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry(items[i], - (line_do_push && needsPush[it + i])); } - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - } + delete pkt; return true; } @@ -488,9 +514,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { - if (applyQueue.empty()) { - return; - } + // if (applyQueue.empty()) { + // return; + // } int block_index = applyQueue.front(); @@ -515,10 +541,12 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - int bit_index = getBitIndexBase(cacheBlocks[block_index].addr) + i; - if (cacheBlocks[block_index].items[i].degree != 0) { + if ((cacheBlocks[block_index].items[i].degree != 0) && + (needsPush[bit_index] == 0)) { + // If the respective bit in the bit vector is set + // there is no need to try and resend it. 
if (peerPushEngine->allocatePushSpace()) { peerPushEngine->recvWLItem( cacheBlocks[block_index].items[i]); @@ -684,6 +712,9 @@ CoalesceEngine::recvPushRetry() } if (applyQueue.find(block_index)) { applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } } } else { // FIXME: Fix the retry mechanism between memory and cache to diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 16e0ca6c6c..044429f8fc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -97,12 +97,11 @@ PushEngine::ReqPort::recvReqRetry() sendPacket(blockedPacket); if (!_blocked) { + blockedPacket = nullptr; DPRINTF(MPU, "%s: Sent the blockedPacket. " "_blocked: %s, (blockedPacket == nullptr): %s.\n", __func__, _blocked ? "true" : "false", (blockedPacket == nullptr) ? "true" : "false"); - - blockedPacket = nullptr; } } From 9f4c1f31be4bf999b1b525e604999d529f33e41b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 01:31:49 -0700 Subject: [PATCH 113/247] Almost fixed retry bugs. 14 wrong vertices in lj. 
--- configs/accl/sega.py | 2 +- src/accl/graph/base/base_mem_engine.cc | 18 ++--- src/accl/graph/sega/coalesce_engine.cc | 95 ++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 5 ++ src/accl/graph/sega/push_engine.cc | 101 +++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 4 +- 6 files changed, 170 insertions(+), 55 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 65645b3bb3..eb209911be 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -174,5 +174,5 @@ def get_inputs(): m5.instantiate() exit_event = m5.simulate() - print("Simulation finished!") + print(f"Exited simulation because {exit_event.getCause()}") exit() diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 32c314033d..e05357950b 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -97,12 +97,8 @@ BaseMemEngine::MemPort::recvReqRetry() void BaseMemEngine::processNextMemReqEvent() { - if (memPort.blocked()) { - return; - } - - if (((respBuffSize() + onTheFlyReqs) < respQueueSize) || - (respQueueSize == 0)) { + if ((respQueueSize == 0) || + ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; @@ -120,7 +116,8 @@ BaseMemEngine::processNextMemReqEvent() } } - if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + if ((!memPort.blocked()) && + (!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -183,8 +180,7 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - assert(!outstandingMemReqQueue.empty()); - if (!nextMemReqEvent.scheduled()) { + if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -202,8 +198,8 @@ 
BaseMemEngine::requestMemRetry(int space) { void BaseMemEngine::wakeUp() { - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { + assert(!nextMemReqEvent.scheduled()); + if (!outstandingMemReqQueue.empty()) { schedule(nextMemReqEvent, nextCycle()); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 274d85a5b1..dde6e46aa9 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,6 +31,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" +#include "debug/CoalesceEngine.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -44,11 +45,14 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + currentBitSliceIndex(0), + numRetriesReceived(0), applyQueue(numLines), evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), + nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -344,6 +348,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // We read the address to send the wl but it is put in cache before // the read response arrives. if (cacheBlocks[block_index].busyMask == 0) { + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was found in the cache as idle.\n", + __func__, addr); + int push_needed = 0; // It is not busy anymore, we have to send the wl from cache. 
for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && @@ -354,10 +362,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[i].tempProp); cacheBlocks[block_index].items[i].tempProp = new_prop; cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], needsPush[it + i]); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); // Since we have just applied the line, we can take it out of // the applyQueue if it's in there. No need to do the same // thing for evictQueue. @@ -366,6 +379,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (applyQueue.empty() && nextApplyEvent.scheduled()) { deschedule(nextApplyEvent); } + if (cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + if ((!nextEvictEvent.scheduled()) && + (!pendingMemRetry())) { + schedule(nextEvictEvent, nextCycle()); + } + } } } else { // The line is busy. Therefore, we have to disregard the data @@ -374,24 +394,31 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // we still have to rememeber that these items need a retry. // i.e. don't change needsPush, call recvWLItemRetry with // do_push = false - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (cacheBlocks[block_index].items[i].degree == 0))); - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], false); - } + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was found in the cache as busy.\n", + __func__, addr); + peerPushEngine->deallocatePushSpace(numElementsPerLine); } } else { // We have read the address to send the wl and it is not in the // cache. Simply send the items to the PushEngine. 
+ DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was not found in the cache.\n", + __func__, addr); WorkListItem* items = pkt->getPtr(); + int push_needed = 0; // No applying of the line needed. for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); - peerPushEngine->recvWLItemRetry(items[i], needsPush[it + i]); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry(items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); } delete pkt; @@ -514,10 +541,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { - // if (applyQueue.empty()) { - // return; - // } - int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { @@ -665,14 +688,23 @@ CoalesceEngine::processNextEvictEvent() void CoalesceEngine::recvPushRetry() { - DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); + numRetriesReceived++; + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextSendRetryEvent() +{ + DPRINTF(MPU, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; int it = 0; uint32_t slice = 0; bool hit_in_cache = false; - for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsPush[it + i]; @@ -691,14 +723,23 @@ CoalesceEngine::recvPushRetry() break; } } + if (it == (MAX_BITVECTOR_SIZE - numElementsPerLine)) { + it = 0; + } } assert(it < MAX_BITVECTOR_SIZE); + if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { + currentBitSliceIndex = 0; + } else { + currentBitSliceIndex = it + numElementsPerLine; + } - DPRINTF(MPU, "%s: Found slice %u at %d position 
in needsPush.\n", - __func__, slice, it); + DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " + "in needsPush.\n", __func__, slice, it); if (hit_in_cache) { + int push_needed = 0; for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -706,15 +747,26 @@ CoalesceEngine::recvPushRetry() cacheBlocks[block_index].items[i].tempProp); cacheBlocks[block_index].items[i].tempProp = new_prop; cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], - (needsPush[it + i] == 1)); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); if (applyQueue.empty() && nextApplyEvent.scheduled()) { deschedule(nextApplyEvent); } + if (cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + if ((!nextEvictEvent.scheduled()) && + (!pendingMemRetry())) { + schedule(nextEvictEvent, nextCycle()); + } + } } } else { // FIXME: Fix the retry mechanism between memory and cache to @@ -730,6 +782,11 @@ CoalesceEngine::recvPushRetry() requestMemRetry(1); } } + + numRetriesReceived--; + if ((numRetriesReceived > 0) && (!nextSendRetryEvent.scheduled())) { + schedule(nextSendRetryEvent, nextCycle()); + } } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0fa555c84a..e1033a4622 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -96,6 +96,8 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + int currentBitSliceIndex; + int numRetriesReceived; FIFOSet applyQueue; std::bitset needsPush; @@ -114,6 
+116,9 @@ class CoalesceEngine : public BaseMemEngine EventFunctionWrapper nextEvictEvent; void processNextEvictEvent(); + EventFunctionWrapper nextSendRetryEvent; + void processNextSendRetryEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 044429f8fc..d493b34c53 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -105,6 +105,35 @@ PushEngine::ReqPort::recvReqRetry() } } +void +PushEngine::deallocatePushSpace(int space) +{ + retrySpaceAllocated -= space; + DPRINTF(PushEngine, "%s: Deallocated %d spaces. numRetries = %d, " + "nextAddrGenEvent.scheduled() = %s, pendingMemRetry() = %s, " + "pushReqQueue.size() = %d, retrySpaceAllocated = %d.\n", + __func__, space, numRetries, + nextAddrGenEvent.scheduled() ? "true" : "false", + pendingMemRetry() ? "true" : "false", + pushReqQueue.size(), retrySpaceAllocated); + /// DISCUSS: Might have to check whether the addrGenEvent is scheduled + // and or the pushReqQueue is empty. If so we might need to + // send retries. + if ((numRetries > 0) && + ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { + assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + int free_space = + pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + if (free_space > numElementsPerLine) { + DPRINTF(PushEngine, "%s: Found %d free spaces. 
" + "retrySpaceAllocated = %d.\n", __func__, free_space, + retrySpaceAllocated); + retrySpaceAllocated += numElementsPerLine; + peerCoalesceEngine->recvPushRetry(); + } + } +} + void PushEngine::recvWLItem(WorkListItem wl) { @@ -124,32 +153,41 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - assert(!pushReqQueue.empty()); - if ((!nextAddrGenEvent.scheduled()) && - (!memQueueFull())) { - schedule(nextAddrGenEvent, nextCycle()); + if ((!nextAddrGenEvent.scheduled())) { + if (memQueueFull()) { + if (!pendingMemRetry()) { + requestMemRetry(1); + } + } else { + schedule(nextAddrGenEvent, nextCycle()); + } } } void -PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) +PushEngine::recvWLItemRetry(WorkListItem wl) { - DPRINTF(PushEngine, "%s: Received %s with do_push = %s.\n", - __func__, wl.to_string(), do_push ? "true" : "false"); - if (do_push) { - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - assert(wl.degree != 0); - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); - numRetries--; - if ((!nextAddrGenEvent.scheduled()) && - (!memQueueFull())) { + assert(wl.degree != 0); + DPRINTF(PushEngine, "%s: Received %s with retry.\n", + __func__, wl.to_string()); + + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = wl.prop; + + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + numRetries--; + retrySpaceAllocated--; + if ((!nextAddrGenEvent.scheduled())) { + if (memQueueFull()) { + if (!pendingMemRetry()) { + requestMemRetry(1); + } + } else { schedule(nextAddrGenEvent, nextCycle()); } } - retrySpaceAllocated--; } void @@ -177,11 +215,27 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: 
Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + // if ((numRetries > 0) && + // ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + // retrySpaceAllocated++; + // DPRINTF(PushEngine, "%s: Allocated 1 space for retry. " + // "retrySpaceAllocated = %d.\n", + // __func__, retrySpaceAllocated); + // if ((retrySpaceAllocated % numElementsPerLine) == 0) { + // peerCoalesceEngine->recvPushRetry(); + // } + // } if (numRetries > 0) { - retrySpaceAllocated++; - } - if ((retrySpaceAllocated % numElementsPerLine) == 0) { - peerCoalesceEngine->recvPushRetry(); + int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + DPRINTF(PushEngine, "%s: Found %d free spaces in " + "the pushReqQueue.\n", __func__, free_space); + if (free_space > numElementsPerLine) { + retrySpaceAllocated += numElementsPerLine; + DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " + "retrySpaceAllocated = %d.\n", __func__, free_space, + retrySpaceAllocated); + peerCoalesceEngine->recvPushRetry(); + } } } @@ -201,7 +255,7 @@ void PushEngine::recvMemRetry() { assert(!nextAddrGenEvent.scheduled()); - DPRINTF(PushEngine, "%s: Responding to a memory alarm.\n", __func__); + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); schedule(nextAddrGenEvent, nextCycle()); } @@ -285,6 +339,7 @@ PushEngine::createUpdatePacket(Addr addr, T value) bool PushEngine::allocatePushSpace() { + assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { return true; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 11122067d6..9025ae9946 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -155,9 +155,11 @@ class PushEngine : public BaseMemEngine bool allocatePushSpace(); + void deallocatePushSpace(int space); + void recvWLItem(WorkListItem wl); - void 
recvWLItemRetry(WorkListItem wl, bool do_push); + void recvWLItemRetry(WorkListItem wl); void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); From e54f3c1c05a637cea9d8385253edd25fdd7e0b78 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 11:36:14 -0700 Subject: [PATCH 114/247] Deleting comments and updating config. --- configs/accl/sega.py | 14 +++++++------- src/accl/graph/sega/push_engine.cc | 14 ++------------ 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index eb209911be..15431088d2 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=4, + push_req_queue_size=32, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=64, + resp_queue_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=1, - num_tgts_per_mshr=1, + num_mshr_entry=32, + num_tgts_per_mshr=4, outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=64, + on_the_fly_update_map_size=16) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d493b34c53..e87f4d275e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -215,16 +215,6 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - // if ((numRetries > 0) && - // ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { - // retrySpaceAllocated++; - // DPRINTF(PushEngine, "%s: Allocated 1 space for retry. " - // "retrySpaceAllocated = %d.\n", - // __func__, retrySpaceAllocated); - // if ((retrySpaceAllocated % numElementsPerLine) == 0) { - // peerCoalesceEngine->recvPushRetry(); - // } - // } if (numRetries > 0) { int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); DPRINTF(PushEngine, "%s: Found %d free spaces in " @@ -232,8 +222,8 @@ PushEngine::processNextAddrGenEvent() if (free_space > numElementsPerLine) { retrySpaceAllocated += numElementsPerLine; DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - "retrySpaceAllocated = %d.\n", __func__, free_space, - retrySpaceAllocated); + "retrySpaceAllocated = %d.\n", __func__, + numElementsPerLine, retrySpaceAllocated); peerCoalesceEngine->recvPushRetry(); } } From 5a27472b412574e5f3d02f2be34af319c9e70296 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 14:12:33 -0700 Subject: [PATCH 115/247] Adding a new debug print. --- src/accl/graph/sega/coalesce_engine.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dde6e46aa9..e7e528aaf5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -353,6 +353,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, addr); int push_needed = 0; // It is not busy anymore, we have to send the wl from cache. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (cacheBlocks[block_index].items[i].degree == 0))); @@ -369,6 +371,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); // Since we have just applied the line, we can take it out of @@ -397,7 +401,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(CoalesceEngine, "%s: Received read response for retry " "for addr %lu. It was found in the cache as busy.\n", __func__, addr); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); } } else { // We have read the address to send the wl and it is not in the @@ -408,6 +416,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem* items = pkt->getPtr(); int push_needed = 0; // No applying of the line needed. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); @@ -417,6 +427,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); } @@ -740,6 +752,8 @@ CoalesceEngine::processNextSendRetryEvent() if (hit_in_cache) { int push_needed = 0; + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -754,6 +768,8 @@ CoalesceEngine::processNextSendRetryEvent() push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); From 590c8a8870a475383faf26890c014a85bd9068ec Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 15:51:58 -0700 Subject: [PATCH 116/247] Updating debug flags. Adding one per comp. 
--- configs/accl/sega.py | 14 ++-- src/accl/graph/SConscript | 4 +- src/accl/graph/base/SConscript | 1 + src/accl/graph/base/base_mem_engine.cc | 6 +- src/accl/graph/sega/coalesce_engine.cc | 91 +++++++++++++------------- src/accl/graph/sega/push_engine.cc | 9 ++- src/accl/graph/sega/wl_engine.cc | 44 ++++++------- 7 files changed, 82 insertions(+), 87 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 15431088d2..eb209911be 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=32, + push_req_queue_size=4, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=64, - resp_queue_size=64) + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=32, - num_tgts_per_mshr=4, + num_mshr_entry=1, + num_tgts_per_mshr=1, outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=64, - on_the_fly_update_map_size=16) + update_queue_size=1, + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 7ca60c30bd..f5f7e962af 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,5 +27,5 @@ Import('*') -DebugFlag('MPU') -# CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine']) + +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 4c90dfa9a6..45877a12ca 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -32,3 +32,4 @@ SimObject('BaseReduceEngine.py') Source('base_mem_engine.cc') 
Source('base_reduce_engine.cc') +DebugFlag('BaseMemEngine') diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index e05357950b..cb4c1d81bb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -28,7 +28,7 @@ #include "accl/graph/base/base_mem_engine.hh" -#include "debug/MPU.hh" +#include "debug/BaseMemEngine.hh" namespace gem5 { @@ -102,7 +102,7 @@ BaseMemEngine::processNextMemReqEvent() PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; - DPRINTF(MPU, "%s: Sent a packet to memory with the following info. " + DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following info. " "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); @@ -190,7 +190,7 @@ BaseMemEngine::requestMemRetry(int space) { panic_if((memRetryRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); - DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); + DPRINTF(BaseMemEngine, "%s: Alarm requested with space = %d.\n", __func__, space); memRetryRequested = true; memSpaceRequested = space; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e7e528aaf5..522feebace 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,7 +32,6 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" -#include "debug/MPU.hh" #include "mem/packet_access.hh" namespace gem5 @@ -150,7 +149,7 @@ bool CoalesceEngine::recvWLRead(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); - DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * 
peerMemoryAtomSize; assert(aligned_addr % peerMemoryAtomSize == 0); @@ -167,7 +166,7 @@ CoalesceEngine::recvWLRead(Addr addr) // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string(), @@ -184,28 +183,28 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - DPRINTF(MPU, "%s: Addr: %lu is a miss.\n", __func__, addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHRMap.find(block_index) == MSHRMap.end()) { - DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu not " + DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu not " "found in MSHRs.\n", __func__, block_index, addr); assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries - DPRINTF(MPU, "%s: Out of MSHR entries. " + DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection stats.readRejections++; return false; } else { - DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); + DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); - DPRINTF(MPU, "%s: Addr: %lu has a conflict " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. 
" "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -213,13 +212,13 @@ CoalesceEngine::recvWLRead(Addr addr) } cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; if (!cacheBlocks[block_index].busyMask) { applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. " + DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " "applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); assert(!applyQueue.empty()); @@ -231,11 +230,11 @@ CoalesceEngine::recvWLRead(Addr addr) } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); if (memQueueFull()) { - DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. " + DPRINTF(CoalesceEngine, "%s: No space in outstandingMemReqQueue. " "Rejecting request.\n", __func__); stats.readRejections++; return false; @@ -245,19 +244,19 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(MPU, "%s: Allocated cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Allocated cache line[%d] for " "Addr: %lu.\n", __func__, block_index, addr); MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." " req addr (aligned_addr) = %lu, size = %d.\n", __func__, addr, aligned_addr, peerMemoryAtomSize); enqueueMemReq(pkt); - DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", + DPRINTF(CoalesceEngine, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); stats.readMisses++; stats.numVertexReads++; @@ -265,10 +264,10 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu already " + DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu already " "in MSHRs.\n", __func__, block_index, addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -276,7 +275,7 @@ CoalesceEngine::recvWLRead(Addr addr) } if ((!cacheBlocks[block_index].hasConflict) && (aligned_addr != cacheBlocks[block_index].addr)) { - DPRINTF(MPU, "%s: Addr: %lu has a conflict " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; @@ -289,7 +288,7 @@ CoalesceEngine::recvWLRead(Addr addr) } MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.numVertexReads++; return true; @@ -306,11 +305,11 @@ CoalesceEngine::processNextRespondEvent() std::tie(addr_response, worklist_response) = responseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); - DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", + DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); 
responseQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a response from responseQueue. " + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " "responseQueue.size = %d.\n", __func__, responseQueue.size()); @@ -333,7 +332,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(pkt->isResponse()); if (pkt->isWrite()) { delete pkt; - DPRINTF(MPU, "%s: Received a write response for Addr: %lu. Dropping " + DPRINTF(CoalesceEngine, "%s: Received a write response for Addr: %lu. Dropping " "the packet.\n", __func__, pkt->getAddr()); return true; } @@ -440,7 +439,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = (addr / peerMemoryAtomSize) % numLines; - DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false @@ -449,7 +448,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; @@ -462,13 +461,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - 
DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " + DPRINTF(CoalesceEngine, "%s: Pushed cache line[%d][%d] to " "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); @@ -477,7 +476,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // End of the said block servicedIndices.push_back(i); - DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); } } @@ -490,7 +489,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; - DPRINTF(MPU, "%s: Addr: %lu has been serviced and is removed.\n", + DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", __func__, print_addr); } @@ -517,7 +516,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); @@ -529,17 +528,17 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: Wrote to cache line[%d][%d] = %s.\n", + DPRINTF(CoalesceEngine, "%s: Wrote to cache line[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." 
+ DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -556,12 +555,12 @@ CoalesceEngine::processNextApplyEvent() int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { - DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has no change. Therefore, no apply " "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { @@ -596,7 +595,7 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ evictQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + DPRINTF(CoalesceEngine, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } @@ -621,7 +620,7 @@ CoalesceEngine::processNextEvictEvent() if ((cacheBlocks[block_index].busyMask) || (applyQueue.find(block_index))) { - DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseEvictSchedules++; @@ -630,7 +629,7 @@ CoalesceEngine::processNextEvictEvent() (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 
1 : 0); if (!allocateMemQueueSpace(space_needed)) { - DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " + DPRINTF(CoalesceEngine, "%s: There is not enough space in memReqQueue to " "procees the eviction of cache line [%d]. dirty: %d, " "hasConflict: %d.\n", __func__, block_index, cacheBlocks[block_index].dirty, @@ -639,12 +638,12 @@ CoalesceEngine::processNextEvictEvent() return; } else { if (cacheBlocks[block_index].dirty) { - DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", + DPRINTF(CoalesceEngine, "%s: Change observed on cache line [%d].\n", __func__, block_index); PacketPtr write_pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, " + DPRINTF(CoalesceEngine, "%s: Created a write packet to Addr: %lu, " "size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); enqueueMemReq(write_pkt); @@ -653,7 +652,7 @@ CoalesceEngine::processNextEvictEvent() if (cacheBlocks[block_index].hasConflict) { assert(!MSHRMap[block_index].empty()); Addr miss_addr = MSHRMap[block_index].front(); - DPRINTF(MPU, "%s: First conflicting address for cache line[%d]" + DPRINTF(CoalesceEngine, "%s: First conflicting address for cache line[%d]" " is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = @@ -661,7 +660,7 @@ CoalesceEngine::processNextEvictEvent() PacketPtr read_pkt = createReadPacket(aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = %d.\n", __func__, miss_addr, read_pkt->getAddr(), read_pkt->getSize()); @@ -673,7 +672,7 @@ CoalesceEngine::processNextEvictEvent() cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; cacheBlocks[block_index].dirty = false; - DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Allocated cache line [%d] for Addr: %lu.\n", __func__, block_index, aligned_miss_addr); } else { @@ -683,7 +682,7 @@ CoalesceEngine::processNextEvictEvent() cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; cacheBlocks[block_index].dirty = false; - DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", + DPRINTF(CoalesceEngine, "%s: Deallocated cache line [%d].\n", __func__, block_index); } } @@ -709,7 +708,7 @@ CoalesceEngine::recvPushRetry() void CoalesceEngine::processNextSendRetryEvent() { - DPRINTF(MPU, "%s: Received a push retry.\n", __func__); + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; int it = 0; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e87f4d275e..f17619942b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,7 +29,6 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" -#include "debug/MPU.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" @@ -91,14 +90,14 @@ PushEngine::ReqPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - DPRINTF(MPU, "%s: Received a reqRetry.\n", __func__); + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); _blocked = false; sendPacket(blockedPacket); if (!_blocked) { blockedPacket = nullptr; - DPRINTF(MPU, "%s: Sent the blockedPacket. " + DPRINTF(PushEngine, "%s: Sent the blockedPacket. 
" "_blocked: %s, (blockedPacket == nullptr): %s.\n", __func__, _blocked ? "true" : "false", (blockedPacket == nullptr) ? "true" : "false"); @@ -273,7 +272,7 @@ PushEngine::processNextPushEvent() assert(offset < peerMemoryAtomSize); uint32_t value = reqValueMap[pkt->req]; - DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " + DPRINTF(PushEngine, "%s: Looking at the front of the queue. pkt->Addr: %lu, " "offset: %lu\n", __func__, pkt->getAddr(), offset); @@ -287,7 +286,7 @@ PushEngine::processNextPushEvent() if (!reqPort.blocked()) { reqPort.sendPacket(update); stats.numUpdates++; - DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", + DPRINTF(PushEngine, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 27ba5c40c8..9d4fb9cbe9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,7 +28,7 @@ #include "accl/graph/sega/wl_engine.hh" -#include "debug/MPU.hh" +#include "debug/WLEngine.hh" #include "mem/packet_access.hh" namespace gem5 @@ -73,7 +73,7 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - DPRINTF(MPU, "%s: Sending a RetryReq.\n", __func__); + DPRINTF(WLEngine, "%s: Sending a RetryReq.\n", __func__); sendRetryReq(); needSendRetryReq = false; } @@ -129,45 +129,38 @@ WLEngine::processNextReadEvent() uint32_t update_value; std::tie(update_addr, update_value) = updateQueue.front(); - DPRINTF(MPU, "%s: Looking at the front of the updateQueue. Addr: %lu, " + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " "value: %u.\n", __func__, update_addr, update_value); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { - DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", + DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { - DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " - "onTheFlyUpdateMap.size: %lu.\n", - __func__, onTheFlyUpdateMap.size()); - if (coalesceEngine->recvWLRead(update_addr)) { + if (coalesceEngine->recvReadAddr(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " + DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); respPort.checkRetryReq(); } - } else { - DPRINTF(MPU, "%s: No entries available in onTheFlyUpdateMap. " - "onTheFlyUpdateMap.size: %lu.\n", __func__, - onTheFlyUpdateMap.size()); } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " + DPRINTF(WLEngine, "%s: Found the addr: %lu in onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); - DPRINTF(MPU, "%s: Reduced the update_value with the entry in " + DPRINTF(WLEngine, "%s: Reduced the update_value with the entry in " "onTheFlyUpdateMap. 
onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); respPort.checkRetryReq(); @@ -185,7 +178,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); addrWorkListMap[addr] = wl; - DPRINTF(MPU, "%s: Received a WorkListItem from the coalesceEngine. Adding" + DPRINTF(WLEngine, "%s: Received a WorkListItem from the coalesceEngine. Adding" " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", __func__, addr, wl.to_string()); @@ -202,7 +195,7 @@ WLEngine::processNextReduceEvent() Addr addr = it.first; assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " + DPRINTF(WLEngine, "%s: Reducing between onTheFlyUpdateMap and " "addrWorkListMap values. onTheFlyUpdateMap[%lu] = %u, " "addrWorkListMap[%lu] = %s.\n", __func__, addr, onTheFlyUpdateMap[addr], @@ -210,15 +203,14 @@ WLEngine::processNextReduceEvent() // TODO: Generalize this to reduce function rather than just min addrWorkListMap[addr].tempProp = std::min(update_value, addrWorkListMap[addr].tempProp); - DPRINTF(MPU, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", + DPRINTF(WLEngine, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", __func__, addr, addrWorkListMap[addr].to_string()); stats.numReduce++; coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); onTheFlyUpdateMap.erase(addr); - DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap. 
" - "onTheFlyUpdateMap.size: %lu.\n", - __func__, addr, onTheFlyUpdateMap.size()); + DPRINTF(WLEngine, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", + __func__, addr); } addrWorkListMap.clear(); } @@ -231,8 +223,12 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } + if (curTick() == ) { + std + } + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(MPU, "%s: Pushed an item to the back of updateQueue" + DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); delete pkt; From be1246d97085c07ab86fc888111b9cdb8b6b30ea Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 16:11:12 -0700 Subject: [PATCH 117/247] Removing accidentally commented out wrong code. --- src/accl/graph/sega/wl_engine.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9d4fb9cbe9..70a921c48a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -223,10 +223,6 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - if (curTick() == ) { - std - } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" ". updateQueue.size = %u.\n", From c9458f184ad39f8f147bb18a9f3e29f2ecb90ec1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 21 Jul 2022 14:23:35 -0700 Subject: [PATCH 118/247] Adding in between counter for retry. 
--- src/accl/graph/sega/push_engine.cc | 59 +++++++++++++++++++++--------- src/accl/graph/sega/push_engine.hh | 5 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index f17619942b..0c2b3deb3f 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -37,11 +37,10 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), - retrySpaceAllocated(0), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), - numRetries(0), pushReqQueueSize(params.push_req_queue_size), + numRetries(0), retrySpaceAllocated(0), spacesAllocatedBetweenRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), stats(*this) @@ -118,16 +117,28 @@ PushEngine::deallocatePushSpace(int space) /// DISCUSS: Might have to check whether the addrGenEvent is scheduled // and or the pushReqQueue is empty. If so we might need to // send retries. - if ((numRetries > 0) && - ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { - assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + // if ((numRetries > 0) && + // ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { + // assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + // int free_space = + // pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + // if (free_space > numElementsPerLine) { + // DPRINTF(PushEngine, "%s: Found %d free spaces. " + // "retrySpaceAllocated = %d.\n", __func__, free_space, + // retrySpaceAllocated); + // retrySpaceAllocated += numElementsPerLine; + // peerCoalesceEngine->recvPushRetry(); + // } + // } + + if (numRetries > 0) { int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - if (free_space > numElementsPerLine) { - DPRINTF(PushEngine, "%s: Found %d free spaces. 
" - "retrySpaceAllocated = %d.\n", __func__, free_space, - retrySpaceAllocated); - retrySpaceAllocated += numElementsPerLine; + assert(free_space <= numElementsPerLine); + retrySpaceAllocated += free_space; + spacesAllocatedBetweenRetries += free_space; + if (spacesAllocatedBetweenRetries >= numElementsPerLine) { + spacesAllocatedBetweenRetries %= numElementsPerLine; peerCoalesceEngine->recvPushRetry(); } } @@ -214,15 +225,26 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + // if (numRetries > 0) { + // int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + // DPRINTF(PushEngine, "%s: Found %d free spaces in " + // "the pushReqQueue.\n", __func__, free_space); + // if (free_space > numElementsPerLine) { + // retrySpaceAllocated += numElementsPerLine; + // DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " + // "retrySpaceAllocated = %d.\n", __func__, + // numElementsPerLine, retrySpaceAllocated); + // peerCoalesceEngine->recvPushRetry(); + // } + // } + if (numRetries > 0) { - int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - DPRINTF(PushEngine, "%s: Found %d free spaces in " - "the pushReqQueue.\n", __func__, free_space); - if (free_space > numElementsPerLine) { - retrySpaceAllocated += numElementsPerLine; - DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - "retrySpaceAllocated = %d.\n", __func__, - numElementsPerLine, retrySpaceAllocated); + retrySpaceAllocated++; + DPRINTF(PushEngine, "%s: Allocated one space for retry. 
" + "retrySpaceAllocated = %d.\n", __func__, retrySpaceAllocated); + spacesAllocatedBetweenRetries++; + if (spacesAllocatedBetweenRetries == numElementsPerLine) { + spacesAllocatedBetweenRetries = 0; peerCoalesceEngine->recvPushRetry(); } } @@ -331,6 +353,7 @@ PushEngine::allocatePushSpace() { assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + assert(numRetries == 0); return true; } else { numRetries++; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9025ae9946..cd79139bbc 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -99,15 +99,16 @@ class PushEngine : public BaseMemEngine }; int numElementsPerLine; - int retrySpaceAllocated; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; Addr baseEdgeAddr; - int numRetries; int pushReqQueueSize; + int numRetries; + int retrySpaceAllocated; + int spacesAllocatedBetweenRetries; std::deque pushReqQueue; // TODO: Add size one size for all these maps diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 70a921c48a..79bf046ba3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -136,7 +136,7 @@ WLEngine::processNextReadEvent() DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { - if (coalesceEngine->recvReadAddr(update_addr)) { + if (coalesceEngine->recvWLRead(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, From cb3169882f5dd404f87f533f104d1fa346da30f1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 21 Jul 2022 23:24:32 -0700 Subject: [PATCH 119/247] Fixing the retry mechanism. 
--- src/accl/graph/sega/coalesce_engine.cc | 21 ++++-- src/accl/graph/sega/push_engine.cc | 89 +++++++++----------------- src/accl/graph/sega/push_engine.hh | 9 ++- 3 files changed, 55 insertions(+), 64 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 522feebace..b3167a0e95 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -354,6 +354,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // It is not busy anymore, we have to send the wl from cache. DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (cacheBlocks[block_index].items[i].degree == 0))); @@ -374,6 +375,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); // Since we have just applied the line, we can take it out of // the applyQueue if it's in there. No need to do the same // thing for evictQueue. @@ -402,7 +404,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, addr); DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine); + assert(peerPushEngine->getNumRetries() == needsPush.count()); DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); } @@ -417,6 +421,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // No applying of the line needed. 
DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); @@ -430,6 +435,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); } delete pkt; @@ -708,6 +714,13 @@ CoalesceEngine::recvPushRetry() void CoalesceEngine::processNextSendRetryEvent() { + if (needsPush.count() == 0) { + DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " + "bit in needsPush. Rejecting the retry.\n", __func__); + peerPushEngine->recvRetryReject(); + return; + } + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; @@ -715,7 +728,8 @@ CoalesceEngine::processNextSendRetryEvent() uint32_t slice = 0; bool hit_in_cache = false; - for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; + it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsPush[it + i]; @@ -734,9 +748,6 @@ CoalesceEngine::processNextSendRetryEvent() break; } } - if (it == (MAX_BITVECTOR_SIZE - numElementsPerLine)) { - it = 0; - } } assert(it < MAX_BITVECTOR_SIZE); @@ -753,6 +764,7 @@ CoalesceEngine::processNextSendRetryEvent() int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -770,6 +782,7 @@ CoalesceEngine::processNextSendRetryEvent() DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, 
needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); if (applyQueue.empty() && nextApplyEvent.scheduled()) { diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0c2b3deb3f..6db91734fe 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -40,7 +40,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), - numRetries(0), retrySpaceAllocated(0), spacesAllocatedBetweenRetries(0), + numTotalRetries(0), numPendingRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), stats(*this) @@ -106,39 +106,22 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::deallocatePushSpace(int space) { - retrySpaceAllocated -= space; - DPRINTF(PushEngine, "%s: Deallocated %d spaces. numRetries = %d, " - "nextAddrGenEvent.scheduled() = %s, pendingMemRetry() = %s, " - "pushReqQueue.size() = %d, retrySpaceAllocated = %d.\n", - __func__, space, numRetries, - nextAddrGenEvent.scheduled() ? "true" : "false", - pendingMemRetry() ? "true" : "false", - pushReqQueue.size(), retrySpaceAllocated); /// DISCUSS: Might have to check whether the addrGenEvent is scheduled // and or the pushReqQueue is empty. If so we might need to // send retries. - // if ((numRetries > 0) && - // ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { - // assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); - // int free_space = - // pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - // if (free_space > numElementsPerLine) { - // DPRINTF(PushEngine, "%s: Found %d free spaces. 
" - // "retrySpaceAllocated = %d.\n", __func__, free_space, - // retrySpaceAllocated); - // retrySpaceAllocated += numElementsPerLine; - // peerCoalesceEngine->recvPushRetry(); - // } - // } - - if (numRetries > 0) { - int free_space = - pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - assert(free_space <= numElementsPerLine); - retrySpaceAllocated += free_space; - spacesAllocatedBetweenRetries += free_space; - if (spacesAllocatedBetweenRetries >= numElementsPerLine) { - spacesAllocatedBetweenRetries %= numElementsPerLine; + DPRINTF(PushEngine, "%s: Received reported %d free spaces.\n", + __func__, space); + numPendingRetries--; + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " + "free spaces.\n", __func__, free_space); + if ((free_space > numElementsPerLine) && + (numTotalRetries >= numPendingRetries)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + numPendingRetries++; peerCoalesceEngine->recvPushRetry(); } } @@ -162,6 +145,8 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", + __func__, pushReqQueue.size()); if ((!nextAddrGenEvent.scheduled())) { if (memQueueFull()) { @@ -187,8 +172,10 @@ PushEngine::recvWLItemRetry(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - numRetries--; - retrySpaceAllocated--; + DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", + __func__, pushReqQueue.size()); + + numTotalRetries--; if ((!nextAddrGenEvent.scheduled())) { if (memQueueFull()) { if (!pendingMemRetry()) { @@ -225,26 +212,16 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - // if (numRetries > 0) { - // int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - // DPRINTF(PushEngine, "%s: Found %d free spaces in " - // "the pushReqQueue.\n", __func__, free_space); - // if (free_space > numElementsPerLine) { - // retrySpaceAllocated += numElementsPerLine; - // DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - // "retrySpaceAllocated = %d.\n", __func__, - // numElementsPerLine, retrySpaceAllocated); - // peerCoalesceEngine->recvPushRetry(); - // } - // } - - if (numRetries > 0) { - retrySpaceAllocated++; - DPRINTF(PushEngine, "%s: Allocated one space for retry. " - "retrySpaceAllocated = %d.\n", __func__, retrySpaceAllocated); - spacesAllocatedBetweenRetries++; - if (spacesAllocatedBetweenRetries == numElementsPerLine) { - spacesAllocatedBetweenRetries = 0; + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" + "free spaces.\n", __func__, free_space); + if ((free_space > numElementsPerLine) && + (numTotalRetries >= numPendingRetries)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + numPendingRetries++; peerCoalesceEngine->recvPushRetry(); } } @@ -350,13 +327,11 @@ PushEngine::createUpdatePacket(Addr addr, T value) bool PushEngine::allocatePushSpace() { - assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || - ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { - assert(numRetries == 0); + ((pushReqQueue.size() < pushReqQueueSize) && (numTotalRetries == 0))) { return true; } else { - numRetries++; + numTotalRetries++; return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index cd79139bbc..a3a308554f 100644 --- a/src/accl/graph/sega/push_engine.hh +++ 
b/src/accl/graph/sega/push_engine.hh @@ -106,9 +106,8 @@ class PushEngine : public BaseMemEngine Addr baseEdgeAddr; int pushReqQueueSize; - int numRetries; - int retrySpaceAllocated; - int spacesAllocatedBetweenRetries; + int numTotalRetries; + int numPendingRetries; std::deque pushReqQueue; // TODO: Add size one size for all these maps @@ -164,6 +163,10 @@ class PushEngine : public BaseMemEngine void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); + + int getNumRetries() { return numTotalRetries; } + + void recvRetryReject() { numPendingRetries--; } }; } From c03a23a38717d7dd123bb92b0a55bb048e53545f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 15:59:31 -0700 Subject: [PATCH 120/247] Limiting retries to one. --- src/accl/graph/sega/push_engine.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 6db91734fe..ab2962b253 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -118,7 +118,7 @@ PushEngine::deallocatePushSpace(int space) DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " "free spaces.\n", __func__, free_space); if ((free_space > numElementsPerLine) && - (numTotalRetries >= numPendingRetries)) { + (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); numPendingRetries++; @@ -218,7 +218,7 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" "free spaces.\n", __func__, free_space); if ((free_space > numElementsPerLine) && - (numTotalRetries >= numPendingRetries)) { + (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); numPendingRetries++; From dcfaab330d517c1b02c8aaa882336698d1a29de6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 17:28:51 -0700 Subject: [PATCH 121/247] Adding MemoryEvent 
class and nextReadOnMissEvent. --- src/accl/graph/sega/coalesce_engine.cc | 42 +++++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 21 +++++++++++-- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b3167a0e95..033c1f3363 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,6 +48,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numRetriesReceived(0), applyQueue(numLines), evictQueue(numLines), + nextReadOnMissEvent([this] { processNextReadOnMissEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -175,7 +176,6 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; - assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } @@ -233,9 +233,9 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (memQueueFull()) { - DPRINTF(CoalesceEngine, "%s: No space in outstandingMemReqQueue. " - "Rejecting request.\n", __func__); + if (lineFillBuffer.size() == numMSHREntry) { + DPRINTF(CoalesceEngine, "%s: No space left in " + "lineFillBuffer. Rejecting request.\n", __func__); stats.readRejections++; return false; } @@ -255,9 +255,15 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = %d.\n", __func__, addr, aligned_addr, peerMemoryAtomSize); - enqueueMemReq(pkt); - DPRINTF(CoalesceEngine, "%s: Pushed pkt to outstandingMemReqQueue.\n", - __func__); + // enqueueMemReq(pkt); + lineFillBuffer.push_back(pkt); + DPRINTF(CoalesceEngine, "%s: Pushed pkt to " + "lineFillBuffer. lineFillBuffer.size = %d.\n", + __func__, lineFillBuffer.size()); + if ((!nextReadOnMissEvent.pending()) && + (!nextReadOnMissEvent.scheduled())) { + schedule(nextReadOnMissEvent, nextCycle()); + } stats.readMisses++; stats.numVertexReads++; return true; @@ -296,6 +302,28 @@ CoalesceEngine::recvWLRead(Addr addr) } } +void +CoalesceEngine::processNextReadOnMissEvent() +{ + if (memQueueFull()) { + nextReadOnMissEvent.sleep(); + // TODO: Implement interface where events of the CoalesceEngine are + // pushed to a fifo to be scheduled later. + return; + } + + PacketPtr pkt = lineFillBuffer.front(); + enqueueMemReq(pkt); + + lineFillBuffer.pop_front(); + + if (!lineFillBuffer.empty()) { + assert(!nextReadOnMissEvent.scheduled()); + assert(!nextReadOnMissEvent.pending()); + schedule(nextReadOnMissEvent, nextCycle()); + } +} + // TODO: For loop to empty the entire responseQueue. 
void CoalesceEngine::processNextRespondEvent() diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e1033a4622..05fa555ec8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -49,6 +49,20 @@ class WLEngine; class CoalesceEngine : public BaseMemEngine { private: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _pending(false) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + }; + struct Block { WorkListItem* items; @@ -93,7 +107,7 @@ class CoalesceEngine : public BaseMemEngine int numMSHREntry; int numTgtsPerMSHR; std::unordered_map> MSHRMap; - + std::deque lineFillBuffer; std::deque> responseQueue; int currentBitSliceIndex; @@ -107,13 +121,16 @@ class CoalesceEngine : public BaseMemEngine int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + MemoryEvent nextReadOnMissEvent; + void processNextReadOnMissEvent(); + EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - EventFunctionWrapper nextEvictEvent; + MemoryEvent nextEvictEvent; void processNextEvictEvent(); EventFunctionWrapper nextSendRetryEvent; From 7db47e2a89611412310f3f50e32df6433a429af4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 22:04:08 -0700 Subject: [PATCH 122/247] Restructuring events and adding nextWriteBackEvent. 
--- src/accl/graph/base/data_structs.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 290 ++++++++++++------------- src/accl/graph/sega/coalesce_engine.hh | 21 +- src/accl/graph/sega/push_engine.cc | 4 +- 4 files changed, 153 insertions(+), 166 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f938be72f1..f178d5a7e2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -90,13 +90,13 @@ static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); template -class FIFOSet +class InOutSet { private: std::unordered_set set; public: - FIFOSet(int cap) + InOutSet(int cap) { set.reserve(cap); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 033c1f3363..ddbd22a8b5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -42,16 +42,17 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntry(params.num_mshr_entry), + numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), currentBitSliceIndex(0), numRetriesReceived(0), applyQueue(numLines), - evictQueue(numLines), - nextReadOnMissEvent([this] { processNextReadOnMissEvent(); }, name()), + writeBackQueue(numLines), + replaceQueue(numLines), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), - nextEvictEvent([this] { processNextEvictEvent(); }, name()), + nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) { @@ -149,7 +150,7 @@ 
CoalesceEngine::getBlockAddrFromBitIndex(int index) bool CoalesceEngine::recvWLRead(Addr addr) { - assert(MSHRMap.size() <= numMSHREntry); + assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; @@ -184,11 +185,11 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHRMap.find(block_index) == MSHRMap.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu not " + if (MSHR.find(block_index) == MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu not " "found in MSHRs.\n", __func__, block_index, addr); - assert(MSHRMap.size() <= numMSHREntry); - if (MSHRMap.size() == numMSHREntry) { + assert(MSHR.size() <= numMSHREntries); + if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); @@ -199,24 +200,26 @@ CoalesceEngine::recvWLRead(Addr addr) } else { DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { - assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); - if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. 
" "Rejecting request.\n", __func__, block_index); stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; - MSHRMap[block_index].push_back(addr); + MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; - if (!cacheBlocks[block_index].busyMask) { + + if ((cacheBlocks[block_index].busyMask == 0) && + (cacheBlocks[block_index].valid)) { applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " "applyQueue.size = %u.\n", __func__, @@ -230,39 +233,31 @@ CoalesceEngine::recvWLRead(Addr addr) } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " - "allocate a cache line for it.\n", - __func__, addr); - if (lineFillBuffer.size() == numMSHREntry) { - DPRINTF(CoalesceEngine, "%s: No space left in " - "lineFillBuffer. Rejecting request.\n", __func__); - stats.readRejections++; - return false; - } + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. 
" + "Allocating a cache line for it.\n" + , __func__, addr); + cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(CoalesceEngine, "%s: Allocated cache line[%d] for " - "Addr: %lu.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + " Addr: %lu.\n", __func__, block_index, addr); - MSHRMap[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); - PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, addr, aligned_addr, peerMemoryAtomSize); // enqueueMemReq(pkt); - lineFillBuffer.push_back(pkt); - DPRINTF(CoalesceEngine, "%s: Pushed pkt to " - "lineFillBuffer. lineFillBuffer.size = %d.\n", - __func__, lineFillBuffer.size()); - if ((!nextReadOnMissEvent.pending()) && - (!nextReadOnMissEvent.scheduled())) { - schedule(nextReadOnMissEvent, nextCycle()); + fillQueue.push_back(block_index); + // FIXME: Fix this DPRINTF + // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " + // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", + // __func__, fillQueue.size()); + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } stats.readMisses++; stats.numVertexReads++; @@ -270,10 +265,10 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu already " + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu already " "in MSHRs.\n", __func__, block_index, addr); - if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -293,7 +288,7 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readHitUnderMisses++; } - MSHRMap[block_index].push_back(addr); + MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.numVertexReads++; @@ -303,24 +298,29 @@ CoalesceEngine::recvWLRead(Addr addr) } void -CoalesceEngine::processNextReadOnMissEvent() +CoalesceEngine::processNextMemoryReadEvent() { if (memQueueFull()) { - nextReadOnMissEvent.sleep(); + nextMemoryReadEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. return; } - PacketPtr pkt = lineFillBuffer.front(); + int block_index = fillQueue.front(); + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + enqueueMemReq(pkt); - lineFillBuffer.pop_front(); + fillQueue.pop_front(); - if (!lineFillBuffer.empty()) { - assert(!nextReadOnMissEvent.scheduled()); - assert(!nextReadOnMissEvent.pending()); - schedule(nextReadOnMissEvent, nextCycle()); + if (!fillQueue.empty()) { + assert(!nextMemoryReadEvent.scheduled()); + assert(!nextMemoryReadEvent.pending()); + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -347,11 +347,13 @@ CoalesceEngine::processNextRespondEvent() } } +// FIXME: Update this for implementing event retry interaction. void CoalesceEngine::recvMemRetry() { - assert(!nextEvictEvent.scheduled()); - schedule(nextEvictEvent, nextCycle()); + // assert(!nextEvictEvent.scheduled()); + // schedule(nextEvictEvent, nextCycle()); + return; } bool @@ -413,10 +415,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); - if ((!nextEvictEvent.scheduled()) && - (!pendingMemRetry())) { - schedule(nextEvictEvent, nextCycle()); + writeBackQueue.push_back(block_index); + if ((!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } } } @@ -477,7 +479,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false - (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR + (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -490,18 +492,18 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; - for (int i = 0; i < MSHRMap[block_index].size(); i++) { - Addr miss_addr = MSHRMap[block_index][i]; + for (int 
i = 0; i < MSHR[block_index].size(); i++) { + Addr miss_addr = MSHR[block_index][i]; Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cache line[%d] could " + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cacheBlocks[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Pushed cache line[%d][%d] to " + DPRINTF(CoalesceEngine, "%s: Pushed cacheBlocks[%d][%d] to " "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); @@ -510,25 +512,25 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // End of the said block servicedIndices.push_back(i); - DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " "removal.\n", __func__, i, block_index); } } // TODO: We Can use taken instead of this - // TODO: Change the MSHRMap from map to map + // TODO: Change the MSHR from map to map int bias = 0; for (int i = 0; i < servicedIndices.size(); i++) { - Addr print_addr = MSHRMap[block_index][i - bias]; - MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + Addr print_addr = MSHR[block_index][i - bias]; + MSHR[block_index].erase(MSHR[block_index].begin() + servicedIndices[i] - bias); bias++; DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", __func__, print_addr); } - if (MSHRMap[block_index].empty()) { - MSHRMap.erase(block_index); + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { assert(cacheBlocks[block_index].hasConflict); @@ -562,13 +564,13 @@ 
CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(CoalesceEngine, "%s: Wrote to cache line[%d][%d] = %s.\n", + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cache line[%d]." + DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cacheBlocks[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); @@ -588,13 +590,13 @@ CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); - if (cacheBlocks[block_index].busyMask) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid apply process. " + if (cacheBlocks[block_index].busyMask != 0) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has no change. Therefore, no apply " + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. Therefore, no apply " "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { @@ -628,17 +630,17 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ - evictQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); + writeBackQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. 
writeBackQueue.size = %u.\n", + __func__, block_index, writeBackQueue.size()); } applyQueue.pop_front(); - if ((!evictQueue.empty()) && - (!pendingMemRetry()) && - (!nextEvictEvent.scheduled())) { - schedule(nextEvictEvent, nextCycle()); + if ((!writeBackQueue.empty()) && + (!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } if ((!applyQueue.empty()) && @@ -648,85 +650,64 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextEvictEvent() +CoalesceEngine::processNextWriteBackEvent() { - int block_index = evictQueue.front(); + if (memQueueFull()) { + nextWriteBackEvent.sleep(); + // TODO: Implement interface where events of the CoalesceEngine are + // pushed to a fifo to be scheduled later. + return; + } - if ((cacheBlocks[block_index].busyMask) || + int block_index = writeBackQueue.front(); + + // Why would we write it back if it does not have a conflict? + assert(cacheBlocks[block_index].hasConflict); + + if ((cacheBlocks[block_index].busyMask != 0) || (applyQueue.find(block_index))) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid evict process. " - "Therefore, ignoring the apply schedule.\n", + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " + "writeback process. Therefore, ignoring the apply schedule.\n", __func__, block_index); + // FIXME: Fix the name of this stat. stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks[block_index].dirty ? - (cacheBlocks[block_index].hasConflict ? 2 : 1) : - (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!allocateMemQueueSpace(space_needed)) { - DPRINTF(CoalesceEngine, "%s: There is not enough space in memReqQueue to " - "procees the eviction of cache line [%d]. 
dirty: %d, " - "hasConflict: %d.\n", __func__, block_index, - cacheBlocks[block_index].dirty, - cacheBlocks[block_index].hasConflict); - requestMemRetry(space_needed); - return; - } else { - if (cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: Change observed on cache line [%d].\n", - __func__, block_index); - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to Addr: %lu, " - "size = %d.\n", __func__, - write_pkt->getAddr(), write_pkt->getSize()); - enqueueMemReq(write_pkt); - } - - if (cacheBlocks[block_index].hasConflict) { - assert(!MSHRMap[block_index].empty()); - Addr miss_addr = MSHRMap[block_index].front(); - DPRINTF(CoalesceEngine, "%s: First conflicting address for cache line[%d]" - " is Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, - read_pkt->getAddr(), read_pkt->getSize()); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Allocated cache line [%d] for Addr: %lu.\n", - __func__, block_index, aligned_miss_addr); - } else { - - // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Deallocated cache line [%d].\n", - __func__, block_index); - } + if (cacheBlocks[block_index].dirty) { + DPRINTF(CoalesceEngine, "%s: Change observed on " + "cacheBlocks[%d].\n", __func__, block_index); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + write_pkt->getAddr(), write_pkt->getSize()); + enqueueMemReq(write_pkt); } + assert(!MSHR[block_index].empty()); + Addr miss_addr = MSHR[block_index].front(); + DPRINTF(CoalesceEngine, "%s: First conflicting address for " + "cacheBlocks[%d] is Addr: %lu.\n", + __func__, block_index, miss_addr); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].dirty = false; + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " + "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); + fillQueue.push_back(block_index); } - evictQueue.pop_front(); + writeBackQueue.pop_front(); - if ((!evictQueue.empty()) && - (!nextEvictEvent.scheduled())) { - schedule(nextEvictEvent, nextCycle()); + if (!writeBackQueue.empty()) { + assert(!nextWriteBackEvent.pending()); + assert(!nextWriteBackEvent.scheduled()); + schedule(nextWriteBackEvent, nextCycle()); } } @@ -817,10 +798,11 @@ CoalesceEngine::processNextSendRetryEvent() deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - 
evictQueue.push_back(block_index); - if ((!nextEvictEvent.scheduled()) && - (!pendingMemRetry())) { - schedule(nextEvictEvent, nextCycle()); + writeBackQueue.push_back(block_index); + if ((!writeBackQueue.empty()) && + (!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } } } @@ -829,6 +811,8 @@ CoalesceEngine::processNextSendRetryEvent() // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. For now we're enabling infinite // queueing in the outstandingMemReqQueue. + // FIXME: Also do not send requests for cache lines that are already + // read but await data. Just set a flag or sth. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 05fa555ec8..563fa671b3 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -104,25 +104,28 @@ class CoalesceEngine : public BaseMemEngine int numLines; int numElementsPerLine; - int numMSHREntry; + int numMSHREntries; int numTgtsPerMSHR; - std::unordered_map> MSHRMap; - std::deque lineFillBuffer; + std::unordered_map> MSHR; + + std::deque fillQueue; + std::deque> responseQueue; int currentBitSliceIndex; int numRetriesReceived; - FIFOSet applyQueue; + InOutSet applyQueue; std::bitset needsPush; - FIFOSet evictQueue; + InOutSet writeBackQueue; + InOutSet replaceQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - MemoryEvent nextReadOnMissEvent; - void processNextReadOnMissEvent(); + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -130,8 +133,8 @@ class CoalesceEngine : public BaseMemEngine EventFunctionWrapper 
nextApplyEvent; void processNextApplyEvent(); - MemoryEvent nextEvictEvent; - void processNextEvictEvent(); + MemoryEvent nextWriteBackEvent; + void processNextWriteBackEvent(); EventFunctionWrapper nextSendRetryEvent; void processNextSendRetryEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ab2962b253..5ab8db401c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -117,7 +117,7 @@ PushEngine::deallocatePushSpace(int space) (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " "free spaces.\n", __func__, free_space); - if ((free_space > numElementsPerLine) && + if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); @@ -217,7 +217,7 @@ PushEngine::processNextAddrGenEvent() (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" "free spaces.\n", __func__, free_space); - if ((free_space > numElementsPerLine) && + if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); From e0f5242c06f12b799b76455d0b95ba90e6238e74 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 23:57:58 -0700 Subject: [PATCH 123/247] Implemented MemoryEvent retry mechanism. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 87 ++++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 5 +- src/accl/graph/sega/push_engine.cc | 17 +++-- src/accl/graph/sega/push_engine.hh | 3 + 5 files changed, 88 insertions(+), 26 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index eb209911be..ffd74241e7 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -19,7 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=0) + outstanding_mem_req_queue_size=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ddbd22a8b5..4a0600e9c0 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -300,10 +300,16 @@ CoalesceEngine::recvWLRead(Addr addr) void CoalesceEngine::processNextMemoryReadEvent() { + assert(!nextMemoryReadEvent.pending()); if (memQueueFull()) { - nextMemoryReadEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. 
+ nextMemoryReadEvent.sleep(); + if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextMemoryReadEvent"); return; } @@ -351,8 +357,33 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::recvMemRetry() { - // assert(!nextEvictEvent.scheduled()); - // schedule(nextEvictEvent, nextCycle()); + assert(!pendingEventQueue.empty()); + std::string front = pendingEventQueue.front(); + + if (front == "nextMemoryReadEvent") { + assert(!nextMemoryReadEvent.scheduled()); + assert(nextMemoryReadEvent.pending()); + schedule(nextMemoryReadEvent, nextCycle()); + nextMemoryReadEvent.wake(); + } else if (front == "nextWriteBackEvent") { + assert(!nextWriteBackEvent.scheduled()); + assert(nextWriteBackEvent.pending()); + schedule(nextWriteBackEvent, nextCycle()); + nextWriteBackEvent.wake(); + } else if (front == "nextSendRetryEvent") { + assert(!nextSendRetryEvent.scheduled()); + assert(nextSendRetryEvent.pending()); + breakPointFunction(); + schedule(nextSendRetryEvent, nextCycle()); + nextSendRetryEvent.wake(); + } else { + panic("EVENT IS NOT RECOGNIZED.\n"); + } + + pendingEventQueue.pop_front(); + if (!pendingEventQueue.empty()) { + requestMemRetry(1); + } return; } @@ -652,10 +683,16 @@ CoalesceEngine::processNextApplyEvent() void CoalesceEngine::processNextWriteBackEvent() { + assert(!nextWriteBackEvent.pending()); if (memQueueFull()) { nextWriteBackEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. 
+ if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextWriteBackEvent"); return; } @@ -715,20 +752,25 @@ void CoalesceEngine::recvPushRetry() { numRetriesReceived++; - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } + // For now since we do only one retry at a time, we should not receive + // a retry while this nextSendingRetryEvent is scheduled or is pending. + assert(!nextSendRetryEvent.pending()); + assert(!nextSendRetryEvent.scheduled()); + assert(numRetriesReceived == 1); + schedule(nextSendRetryEvent, nextCycle()); } void CoalesceEngine::processNextSendRetryEvent() { - if (needsPush.count() == 0) { - DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " - "bit in needsPush. Rejecting the retry.\n", __func__); - peerPushEngine->recvRetryReject(); - return; - } + assert(!nextSendRetryEvent.pending()); + assert(needsPush.count() != 0); + // if (needsPush.count() == 0) { + // DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " + // "bit in needsPush. Rejecting the retry.\n", __func__); + // peerPushEngine->recvRetryReject(); + // return; + // } DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; @@ -807,6 +849,16 @@ CoalesceEngine::processNextSendRetryEvent() } } } else { + if (memQueueFull()) { + nextSendRetryEvent.sleep(); + if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextSendRetryEvent"); + return; + } + // FIXME: Fix the retry mechanism between memory and cache to // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. 
For now we're enabling infinite @@ -816,17 +868,12 @@ CoalesceEngine::processNextSendRetryEvent() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - if (allocateMemQueueSpace(1)) { - enqueueMemReq(pkt); - } else { - requestMemRetry(1); - } + enqueueMemReq(pkt); } numRetriesReceived--; - if ((numRetriesReceived > 0) && (!nextSendRetryEvent.scheduled())) { - schedule(nextSendRetryEvent, nextCycle()); - } + assert(numRetriesReceived == 0); + assert(!nextSendRetryEvent.scheduled()); } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 563fa671b3..83ca6e5f14 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -124,6 +124,8 @@ class CoalesceEngine : public BaseMemEngine int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + std::deque pendingEventQueue; + MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); @@ -136,7 +138,7 @@ class CoalesceEngine : public BaseMemEngine MemoryEvent nextWriteBackEvent; void processNextWriteBackEvent(); - EventFunctionWrapper nextSendRetryEvent; + MemoryEvent nextSendRetryEvent; void processNextSendRetryEvent(); struct CoalesceStats : public statistics::Group @@ -159,6 +161,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; + void breakPointFunction() { std::cout << "Salaam." 
<< std::endl; } protected: virtual int respBuffSize() { return -1; } virtual void recvMemRetry(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5ab8db401c..c64ff003c4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,6 +43,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): numTotalRetries(0), numPendingRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), + nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) {} @@ -121,8 +122,8 @@ PushEngine::deallocatePushSpace(int space) (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); + assert(!nextSendRetryEvent.scheduled()); + schedule(nextSendRetryEvent, nextCycle()); } } } @@ -221,8 +222,8 @@ PushEngine::processNextAddrGenEvent() (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); + assert(!nextSendRetryEvent.scheduled()); + schedule(nextSendRetryEvent, nextCycle()); } } } @@ -239,6 +240,14 @@ PushEngine::processNextAddrGenEvent() } } +void +PushEngine::processNextSendRetryEvent() +{ + assert(numPendingRetries == 0); + numPendingRetries++; + peerCoalesceEngine->recvPushRetry(); +} + void PushEngine::recvMemRetry() { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a3a308554f..378cd1a487 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -128,6 +128,9 @@ class PushEngine : public BaseMemEngine EventFunctionWrapper nextPushEvent; void processNextPushEvent(); + EventFunctionWrapper nextSendRetryEvent; + void processNextSendRetryEvent(); + struct PushStats : public statistics::Group { 
PushStats(PushEngine &push); From 42ff3b88231d9f69c4f0fcb7ccbddfc2db66d799 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Jul 2022 17:43:22 -0700 Subject: [PATCH 124/247] Adding DPRINTF for structure sizes. --- src/accl/graph/SConscript | 2 +- src/accl/graph/base/base_mem_engine.cc | 47 +++++--- src/accl/graph/base/base_mem_engine.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 5 +- src/accl/graph/sega/wl_engine.cc | 151 +++++++++++++++---------- src/accl/graph/sega/wl_engine.hh | 8 +- 7 files changed, 134 insertions(+), 85 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index f5f7e962af..7fd3591b2c 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,5 +27,5 @@ Import('*') - +DebugFlag('SEGAStructureSize') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index cb4c1d81bb..aa78aac8b5 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/base/base_mem_engine.hh" #include "debug/BaseMemEngine.hh" +#include "debug/SEGAStructureSize.hh" namespace gem5 { @@ -37,7 +38,7 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + memQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), respQueueSize(params.resp_queue_size), memRetryRequested(false), @@ -99,17 +100,22 @@ BaseMemEngine::processNextMemReqEvent() { if ((respQueueSize == 0) || ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { - PacketPtr pkt = outstandingMemReqQueue.front(); + PacketPtr pkt = memQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following 
info. " "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); - outstandingMemReqQueue.pop_front(); - + memQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped pkt: %s from " + "memQueue. memQueue.size = %d, memQueueSize = %d.\n", + __func__, pkt->print(), memQueue.size(), memQueueSize); + DPRINTF(BaseMemEngine, "%s: Popped pkt: %s from " + "memQueue. memQueue.size = %d, memQueueSize = %d.\n", + __func__, pkt->print(), memQueue.size(), memQueueSize); if (memRetryRequested && - (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - memSpaceRequested))) { + (memQueue.size() <= + (memQueueSize - memSpaceRequested))) { memRetryRequested = false; memSpaceRequested = 0; recvMemRetry(); @@ -117,7 +123,7 @@ BaseMemEngine::processNextMemReqEvent() } if ((!memPort.blocked()) && - (!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + (!memQueue.empty()) && (!nextMemReqEvent.scheduled())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -156,30 +162,35 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) bool BaseMemEngine::allocateMemQueueSpace(int space) { - assert((outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + assert((memQueueSize == 0) || + (memQueue.size() <= memQueueSize)); return ( - (outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space)) + (memQueueSize == 0) || + (memQueue.size() <= (memQueueSize - space)) ); } bool BaseMemEngine::memQueueFull() { - assert((outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + assert((memQueueSize == 0) || + (memQueue.size() <= memQueueSize)); return ( - (outstandingMemReqQueueSize != 0) && - (outstandingMemReqQueue.size() == outstandingMemReqQueueSize)); + (memQueueSize != 0) && + (memQueue.size() == memQueueSize)); } void BaseMemEngine::enqueueMemReq(PacketPtr pkt) { 
panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); - outstandingMemReqQueue.push_back(pkt); - + memQueue.push_back(pkt); + DPRINTF(SEGAStructureSize, "%s: Pushed pkt: %s to memQueue. " + "memQueue.size = %d, memQueueSize = %d.\n", __func__, + pkt->print(), memQueue.size(), memQueueSize); + DPRINTF(BaseMemEngine, "%s: Pushed pkt: %s to memQueue. " + "memQueue.size = %d, memQueueSize = %d.\n", __func__, + pkt->print(), memQueue.size(), memQueueSize); if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { schedule(nextMemReqEvent, nextCycle()); } @@ -199,7 +210,7 @@ void BaseMemEngine::wakeUp() { assert(!nextMemReqEvent.scheduled()); - if (!outstandingMemReqQueue.empty()) { + if (!memQueue.empty()) { schedule(nextMemReqEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 64ef49ee1d..520970c5a0 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -68,12 +68,12 @@ class BaseMemEngine : public ClockedObject System* system; MemPort memPort; - int outstandingMemReqQueueSize; + int memQueueSize; int onTheFlyReqs; int respQueueSize; bool memRetryRequested; int memSpaceRequested; - std::deque outstandingMemReqQueue; + std::deque memQueue; EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4a0600e9c0..ea572ea749 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -862,7 +862,7 @@ CoalesceEngine::processNextSendRetryEvent() // FIXME: Fix the retry mechanism between memory and cache to // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. For now we're enabling infinite - // queueing in the outstandingMemReqQueue. + // queueing in the memQueue. 
// FIXME: Also do not send requests for cache lines that are already // read but await data. Just set a flag or sth. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c64ff003c4..d745dabef6 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -222,8 +222,9 @@ PushEngine::processNextAddrGenEvent() (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - assert(!nextSendRetryEvent.scheduled()); - schedule(nextSendRetryEvent, nextCycle()); + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } } } } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 79bf046ba3..2d4ffc9cac 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/wl_engine.hh" +#include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" #include "mem/packet_access.hh" @@ -39,7 +40,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".resp_port", this), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), - onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), + registerFileSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) @@ -129,45 +130,68 @@ WLEngine::processNextReadEvent() uint32_t update_value; std::tie(update_addr, update_value) = updateQueue.front(); - DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. Addr: %lu, " - "value: %u.\n", __func__, update_addr, update_value); + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
" + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); - if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { - DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", - __func__, update_addr); - if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + // TODO: It might be a good idea for WLEngine to act differently + // on cache rejects. As a first step the cache should not just + // return a boolean value. It should return an integer/enum + // to tell WLEngine why it rejected the read request. Their might + // be things that WLEngine can do to fix head of the line blocking. if (coalesceEngine->recvWLRead(update_addr)) { - onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.\n", __func__, - update_addr, onTheFlyUpdateMap[update_addr]); + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = update_value; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); - DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" - ". 
updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); respPort.checkRetryReq(); } } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(WLEngine, "%s: Found the addr: %lu in onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, - update_addr, onTheFlyUpdateMap[update_addr]); - onTheFlyUpdateMap[update_addr] = - std::min(update_value, onTheFlyUpdateMap[update_addr]); - DPRINTF(WLEngine, "%s: Reduced the update_value with the entry in " - "onTheFlyUpdateMap. onTheFlyUpdateMap[%lu] = %u.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); - stats.onTheFlyCoalesce++; + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", + __func__, update_addr, update_addr, registerFile[update_addr]); + registerFile[update_addr] = + std::min(update_value, registerFile[update_addr]); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, registerFile[update_addr]); + stats.registerFileCoalesce++; updateQueue.pop_front(); - DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" - ". updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); respPort.checkRetryReq(); } - // TODO: Only schedule nextReadEvent only when it has to be scheduled - if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { + if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { schedule(nextReadEvent, nextCycle()); } } @@ -175,14 +199,16 @@ WLEngine::processNextReadEvent() void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { - assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + assert(workListFile.size() <= registerFileSize); - addrWorkListMap[addr] = wl; - DPRINTF(WLEngine, "%s: Received a WorkListItem from the coalesceEngine. Adding" - " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", - __func__, addr, wl.to_string()); - - assert(!addrWorkListMap.empty()); + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + wl.to_string(), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + wl.to_string(), workListFile.size()); + assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } @@ -191,28 +217,31 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - for (auto &it : addrWorkListMap) { + for (auto &it : workListFile) { Addr addr = it.first; - assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); - uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(WLEngine, "%s: Reducing between onTheFlyUpdateMap and " - "addrWorkListMap values. 
onTheFlyUpdateMap[%lu] = %u, " - "addrWorkListMap[%lu] = %s.\n", __func__, - addr, onTheFlyUpdateMap[addr], - addr, addrWorkListMap[addr].to_string()); + assert(registerFile.find(addr) != registerFile.end()); + uint32_t update_value = registerFile[addr]; + DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" + ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", + __func__, addr, registerFile[addr], + addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min - addrWorkListMap[addr].tempProp = - std::min(update_value, addrWorkListMap[addr].tempProp); - DPRINTF(WLEngine, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", - __func__, addr, addrWorkListMap[addr].to_string()); + workListFile[addr].tempProp = + std::min(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", + __func__, addr, workListFile[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); - onTheFlyUpdateMap.erase(addr); - DPRINTF(WLEngine, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, addr); + coalesceEngine->recvWLWrite(addr, workListFile[addr]); + registerFile.erase(addr); + DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " + "registerFile.size = %d, registerFileSize = %d\n", + __func__, addr, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. " + "registerFile.size = %d, registerFileSize = %d\n", + __func__, addr, registerFile.size(), registerFileSize); } - addrWorkListMap.clear(); + workListFile.clear(); } bool @@ -224,11 +253,19 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" - ". 
updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + + + // delete the packet since it's not needed anymore. delete pkt; - assert(!updateQueue.empty()); + if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } @@ -241,7 +278,7 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), - ADD_STAT(onTheFlyCoalesce, statistics::units::Count::get(), + ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), "Number of memory blocks read for vertecies") { } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 27fc3efa7a..79fe60f6d0 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -74,10 +74,10 @@ class WLEngine : public BaseReduceEngine int updateQueueSize; std::deque> updateQueue; - int onTheFlyUpdateMapSize; - std::unordered_map onTheFlyUpdateMap; + int registerFileSize; + std::unordered_map registerFile; - std::unordered_map addrWorkListMap; + std::unordered_map workListFile; void recvFunctional(PacketPtr pkt); @@ -98,7 +98,7 @@ class WLEngine : public BaseReduceEngine WLEngine &wl; statistics::Scalar numReduce; - statistics::Scalar onTheFlyCoalesce; + statistics::Scalar registerFileCoalesce; }; WorkListStats stats; From 5f513830921f24659a9e7fcb8aea10720a27840a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Jul 2022 17:44:06 -0700 Subject: [PATCH 125/247] Updating config script for sega. 
--- configs/accl/sega.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ffd74241e7..cf189733f0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=4, + push_req_queue_size=16, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=4, + resp_queue_size=8) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=1, - num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=1) + num_mshr_entry=8, + num_tgts_per_mshr=8, + outstanding_mem_req_queue_size=8) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=16, + on_the_fly_update_map_size=8) def getRespPort(self): return self.wl_engine.resp_port From ed206a8acdb86f3aa17df9e1d3d44e241385c67e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:14:08 -0700 Subject: [PATCH 126/247] Adding more assertion for MSHR and fillQueue. 
--- configs/accl/sega.py | 12 ++++++------ src/accl/graph/sega/coalesce_engine.cc | 3 +++ src/accl/graph/sega/push_engine.cc | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index cf189733f0..8fb3b75996 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -16,13 +16,13 @@ def __init__(self, base_edge_addr): self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="1MiB", - num_mshr_entry=8, - num_tgts_per_mshr=8, - outstanding_mem_req_queue_size=8) + cache_size="128B", + num_mshr_entry=1, + num_tgts_per_mshr=1, + outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, - on_the_fly_update_map_size=8) + update_queue_size=1, + on_the_fly_update_map_size=4) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ea572ea749..8f56962a8c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -232,6 +232,7 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { assert(!cacheBlocks[block_index].valid); + assert(MSHR[block_index].size() == 0); // MSHR available and no conflict DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " "Allocating a cache line for it.\n" @@ -251,6 +252,7 @@ CoalesceEngine::recvWLRead(Addr addr) // enqueueMemReq(pkt); fillQueue.push_back(block_index); + assert(fillQueue.size() <= numLines); // FIXME: Fix this DPRINTF // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", @@ -737,6 +739,7 @@ CoalesceEngine::processNextWriteBackEvent() DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); fillQueue.push_back(block_index); + assert(fillQueue.size() <= numLines); } writeBackQueue.pop_front(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d745dabef6..a41ca8a778 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -217,7 +217,7 @@ PushEngine::processNextAddrGenEvent() int free_space = pushReqQueueSize - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - "free spaces.\n", __func__, free_space); + " free spaces.\n", __func__, free_space); if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " From cdfd9817d9a3908fc86b2ec1f95420524b953ea3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:27:10 -0700 Subject: [PATCH 127/247] Adding debug flags for responseQueue size. --- src/accl/graph/sega/coalesce_engine.cc | 41 +++++++++++++++++++------- src/accl/graph/sega/wl_engine.hh | 2 ++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8f56962a8c..959bfa9743 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,6 +32,7 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" namespace gem5 @@ -168,11 +169,18 @@ CoalesceEngine::recvWLRead(Addr addr) // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " - "to responseQueue. 
responseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; @@ -345,9 +353,12 @@ CoalesceEngine::processNextRespondEvent() __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); if ((!nextRespondEvent.scheduled()) && (!responseQueue.empty())) { @@ -536,10 +547,18 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Pushed cacheBlocks[%d][%d] to " - "responseQueue. 
responseQueue.size = %u.\n" - , __func__, block_index, wl_offset, - responseQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 79fe60f6d0..5e8e5b25f3 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -114,6 +114,8 @@ class WLEngine : public BaseReduceEngine bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); + + int getRegisterFileSize() { return registerFileSize; } }; } From 4a466aec9457f93be6bfa689489c8376c08d31c6 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:33:53 -0700 Subject: [PATCH 128/247] Adding assertions to test the size of queues in coalesce engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 10 +++++++++- src/accl/graph/sega/coalesce_engine.hh | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 959bfa9743..753bfc988b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,7 +49,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numRetriesReceived(0), applyQueue(numLines), writeBackQueue(numLines), - replaceQueue(numLines), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), @@ -320,6 +319,8 @@ CoalesceEngine::processNextMemoryReadEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextMemoryReadEvent"); + // Maximum three MemoryEvent. + assert(pendingEventQueue.size() <= 3); return; } @@ -460,6 +461,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (cacheBlocks[block_index].hasConflict) { writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); if ((!nextWriteBackEvent.pending()) && (!nextWriteBackEvent.scheduled())) { schedule(nextWriteBackEvent, nextCycle()); @@ -683,6 +685,7 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. writeBackQueue.size = %u.\n", __func__, block_index, writeBackQueue.size()); } @@ -714,6 +717,8 @@ CoalesceEngine::processNextWriteBackEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextWriteBackEvent"); + // Maximum three MemoryEvent. 
+ assert(pendingEventQueue.size() <= 3); return; } @@ -863,6 +868,7 @@ CoalesceEngine::processNextSendRetryEvent() } if (cacheBlocks[block_index].hasConflict) { writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); if ((!writeBackQueue.empty()) && (!nextWriteBackEvent.pending()) && (!nextWriteBackEvent.scheduled())) { @@ -878,6 +884,8 @@ CoalesceEngine::processNextSendRetryEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextSendRetryEvent"); + // Maximum three MemoryEvent. + assert(pendingEventQueue.size() <= 3); return; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 83ca6e5f14..cfa0a79102 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,7 +118,6 @@ class CoalesceEngine : public BaseMemEngine std::bitset needsPush; InOutSet writeBackQueue; - InOutSet replaceQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); From 48711528ef72651cccb68b08303159ce8b3fc071 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 22:43:28 -0700 Subject: [PATCH 129/247] Checking the size of queues in PushEngine and WLEngine --- src/accl/graph/base/base_mem_engine.cc | 2 +- src/accl/graph/base/base_mem_engine.hh | 3 ++- src/accl/graph/sega/push_engine.cc | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index aa78aac8b5..590307b2bc 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -40,10 +40,10 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): memPort(name() + ".mem_port", this), memQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), - respQueueSize(params.resp_queue_size), memRetryRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), + respQueueSize(params.resp_queue_size), 
_requestorId(system->getRequestorId(this)), peerMemoryAtomSize(params.attached_memory_atom_size) {} diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 520970c5a0..01c862d555 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -70,7 +70,6 @@ class BaseMemEngine : public ClockedObject int memQueueSize; int onTheFlyReqs; - int respQueueSize; bool memRetryRequested; int memSpaceRequested; std::deque memQueue; @@ -79,6 +78,8 @@ class BaseMemEngine : public ClockedObject void processNextMemReqEvent(); protected: + + int respQueueSize; const RequestorID _requestorId; size_t peerMemoryAtomSize; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a41ca8a778..cfebf8e5df 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -173,6 +173,7 @@ PushEngine::recvWLItemRetry(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + assert(pushReqQueue.size() <= pushReqQueueSize); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -263,6 +264,7 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); + assert(memRespQueue.size() <= respQueueSize); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); From 29ae1de4908cf215a44bcd8c9db9091c8306cf1b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 09:33:11 -0700 Subject: [PATCH 130/247] Making CoalesceEngine a BaseMemoryEngine. 
--- configs/accl/sega.py | 13 ++- src/accl/graph/sega/BaseMemoryEngine.py | 42 ++++++++ src/accl/graph/sega/CoalesceEngine.py | 17 ++- src/accl/graph/sega/SConscript | 3 + src/accl/graph/sega/base_memory_engine.cc | 122 ++++++++++++++++++++++ src/accl/graph/sega/base_memory_engine.hh | 99 ++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 70 ++++--------- src/accl/graph/sega/coalesce_engine.hh | 14 +-- 8 files changed, 305 insertions(+), 75 deletions(-) create mode 100644 src/accl/graph/sega/BaseMemoryEngine.py create mode 100644 src/accl/graph/sega/base_memory_engine.cc create mode 100644 src/accl/graph/sega/base_memory_engine.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8fb3b75996..7577331f2b 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,19 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=2, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=4, - resp_queue_size=8) + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="128B", + cache_size="32B", num_mshr_entry=1, - num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=0) + num_tgts_per_mshr=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, - on_the_fly_update_map_size=4) + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 7667a22c5a..536c3477ae 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,21 +27,16 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseMemEngine import BaseMemEngine +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class CoalesceEngine(BaseMemEngine): +class CoalesceEngine(BaseMemoryEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - peer_push_engine = Param.PushEngine(NULL, "") - - cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") - - num_mshr_entry = Param.Int(4, "") - num_tgts_per_mshr = Param.Int(20, "") - - # Don't change. If changed. It will break functionality of coalesce. 
- resp_queue_size = 0 + peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") + cache_size = Param.MemorySize("16KiB", "Size of the internal SRAM array.") + num_mshr_entry = Param.Int(4, "Number of MSHR entries.") + num_tgts_per_mshr = Param.Int(20, "Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 77e508f4ed..97a62d44a0 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,16 +27,19 @@ Import('*') +SimObject('BaseMemoryEngine.py') SimObject('CenteralController.py') SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') +Source('base_memory_engine.cc') Source('centeral_controller.cc') Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') +DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') DebugFlag('CoalesceEngine') diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc new file mode 100644 index 0000000000..e5e78f2c04 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/base_memory_engine.hh" + +#include "debug/BaseMemoryEngine.hh" +#include "debug/SEGAStructureSize.hh" + +namespace gem5 +{ + +BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)), + memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size) +{} + +BaseMemoryEngine::~BaseMemoryEngine() +{} + +Port& +BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } else { + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + 
sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..8fb8fde7e6 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ + +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseMemoryEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseMemoryEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseMemoryEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseMemoryEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + protected: + System* system; + const RequestorID _requestorId; + + MemPort memPort; + + size_t peerMemoryAtomSize; + + virtual void recvMemRetry() = 0; + virtual bool handleMemResp(PacketPtr pkt) = 0; + + PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + + 
public: + PARAMS(BaseMemoryEngine); + + BaseMemoryEngine(const Params ¶ms); + ~BaseMemoryEngine(); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } + + void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + +}; + +} + +#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 753bfc988b..678cf0456e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -38,8 +38,8 @@ namespace gem5 { -CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): - BaseMemEngine(params), +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), @@ -67,12 +67,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): needsPush.reset(); } -void -CoalesceEngine::recvFunctional(PacketPtr pkt) -{ - sendMemFunctional(pkt); -} - void CoalesceEngine::startup() { @@ -171,13 +165,13 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. 
@@ -257,7 +251,6 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - // enqueueMemReq(pkt); fillQueue.push_back(block_index); assert(fillQueue.size() <= numLines); // FIXME: Fix this DPRINTF @@ -310,16 +303,12 @@ void CoalesceEngine::processNextMemoryReadEvent() { assert(!nextMemoryReadEvent.pending()); - if (memQueueFull()) { + if (memPort.blocked()) { // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. nextMemoryReadEvent.sleep(); - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextMemoryReadEvent"); - // Maximum three MemoryEvent. + // Maximum three MemoryEvents. assert(pendingEventQueue.size() <= 3); return; } @@ -330,7 +319,7 @@ CoalesceEngine::processNextMemoryReadEvent() DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - enqueueMemReq(pkt); + memPort.sendPacket(pkt); fillQueue.pop_front(); @@ -367,11 +356,13 @@ CoalesceEngine::processNextRespondEvent() } } -// FIXME: Update this for implementing event retry interaction. void CoalesceEngine::recvMemRetry() { - assert(!pendingEventQueue.empty()); + if (pendingEventQueue.empty()) { + return; + } + std::string front = pendingEventQueue.front(); if (front == "nextMemoryReadEvent") { @@ -387,7 +378,6 @@ CoalesceEngine::recvMemRetry() } else if (front == "nextSendRetryEvent") { assert(!nextSendRetryEvent.scheduled()); assert(nextSendRetryEvent.pending()); - breakPointFunction(); schedule(nextSendRetryEvent, nextCycle()); nextSendRetryEvent.wake(); } else { @@ -395,12 +385,10 @@ CoalesceEngine::recvMemRetry() } pendingEventQueue.pop_front(); - if (!pendingEventQueue.empty()) { - requestMemRetry(1); - } return; } +// FIXME: Fix this function. 
bool CoalesceEngine::handleMemResp(PacketPtr pkt) { @@ -552,13 +540,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. @@ -708,14 +696,8 @@ void CoalesceEngine::processNextWriteBackEvent() { assert(!nextWriteBackEvent.pending()); - if (memQueueFull()) { + if (memPort.blocked()) { nextWriteBackEvent.sleep(); - // TODO: Implement interface where events of the CoalesceEngine are - // pushed to a fifo to be scheduled later. - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextWriteBackEvent"); // Maximum three MemoryEvent. 
assert(pendingEventQueue.size() <= 3); @@ -744,7 +726,7 @@ CoalesceEngine::processNextWriteBackEvent() DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); - enqueueMemReq(write_pkt); + memPort.sendPacket(write_pkt); } assert(!MSHR[block_index].empty()); Addr miss_addr = MSHR[block_index].front(); @@ -764,6 +746,10 @@ CoalesceEngine::processNextWriteBackEvent() "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); fillQueue.push_back(block_index); assert(fillQueue.size() <= numLines); + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())){ + schedule(nextMemoryReadEvent, nextCycle()); + } } writeBackQueue.pop_front(); @@ -792,12 +778,6 @@ CoalesceEngine::processNextSendRetryEvent() { assert(!nextSendRetryEvent.pending()); assert(needsPush.count() != 0); - // if (needsPush.count() == 0) { - // DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " - // "bit in needsPush. Rejecting the retry.\n", __func__); - // peerPushEngine->recvRetryReject(); - // return; - // } DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; @@ -877,12 +857,8 @@ CoalesceEngine::processNextSendRetryEvent() } } } else { - if (memQueueFull()) { + if (memPort.blocked()) { nextSendRetryEvent.sleep(); - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextSendRetryEvent"); // Maximum three MemoryEvent. 
assert(pendingEventQueue.size() <= 3); @@ -898,7 +874,7 @@ CoalesceEngine::processNextSendRetryEvent() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - enqueueMemReq(pkt); + memPort.sendPacket(pkt); } numRetriesReceived--; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index cfa0a79102..a322379b05 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -31,7 +31,7 @@ #include -#include "accl/graph/base/base_mem_engine.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" @@ -39,14 +39,12 @@ #define MAX_BITVECTOR_SIZE (1 << 30) -// TODO: Add parameters for size, memory atom size, type size, -// length of items in the blocks. namespace gem5 { class WLEngine; -class CoalesceEngine : public BaseMemEngine +class CoalesceEngine : public BaseMemoryEngine { private: class MemoryEvent : public EventFunctionWrapper @@ -160,16 +158,14 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; - void breakPointFunction() { std::cout << "Salaam." << std::endl; } protected: - virtual int respBuffSize() { return -1; } virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(CoalesceEngine); - CoalesceEngine(const CoalesceEngineParams ¶ms); + CoalesceEngine(const Params ¶ms); bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); @@ -178,9 +174,7 @@ class CoalesceEngine : public BaseMemEngine void recvPushRetry(); - void recvFunctional(PacketPtr pkt); - - virtual void startup(); + virtual void startup() override; }; } From bbc7e3afbea04fd283157f89d024f4f9b9c2d78d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 13:06:22 -0700 Subject: [PATCH 131/247] Fixing cache mapping issue. 
--- src/accl/graph/SConscript | 3 +- src/accl/graph/sega/base_memory_engine.cc | 14 +++ src/accl/graph/sega/base_memory_engine.hh | 2 + src/accl/graph/sega/coalesce_engine.cc | 105 ++++++++++++---------- src/accl/graph/sega/coalesce_engine.hh | 6 +- 5 files changed, 78 insertions(+), 52 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 7fd3591b2c..53c6411de6 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,4 +28,5 @@ Import('*') DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', + 'BaseMemEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index e5e78f2c04..9db95d6bd6 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -55,6 +55,20 @@ BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) } } +void +BaseMemoryEngine::init() +{ + AddrRangeList memory_ranges = memPort.getAddrRanges(); + // BaseMemoryEngine only supports one memory. + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is %s. " + "The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); +} + void BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index 8fb8fde7e6..efbfa5312d 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -69,6 +69,7 @@ class BaseMemoryEngine : public ClockedObject System* system; const RequestorID _requestorId; + AddrRange peerMemoryRange; MemPort memPort; size_t peerMemoryAtomSize; @@ -92,6 +93,7 @@ class BaseMemoryEngine : public ClockedObject void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + virtual void init() override; }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 678cf0456e..21f048213a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -67,44 +67,48 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): needsPush.reset(); } -void -CoalesceEngine::startup() -{ - AddrRangeList vertex_ranges = getAddrRanges(); - - bool found = false; - Addr first_match_addr = 0; - while(true) { - for (auto range: vertex_ranges) { - if (range.contains(first_match_addr)) { - found = true; - break; - } - } - if (found) { - break; - } - first_match_addr += peerMemoryAtomSize; - } - - found = false; - Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - while(true) { - for (auto range: vertex_ranges) { - if (range.contains(second_match_addr)) { - found = true; - break; - } - } - if (found) { - break; - } - second_match_addr += peerMemoryAtomSize; - } - - nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); - memoryAddressOffset = first_match_addr; -} +// void +// CoalesceEngine::startup() +// { +// return; + // std::cout << "Hello" << std::endl; + // DPRINTF(CoalesceEngine, "%s: Range attached to this engine is %s.\n", + // __func__, peerMemoryRange.to_string()); + // AddrRangeList vertex_ranges = getAddrRanges(); + + 
// bool found = false; + // Addr first_match_addr = 0; + // while(true) { + // for (auto range: vertex_ranges) { + // if (range.contains(first_match_addr)) { + // found = true; + // break; + // } + // } + // if (found) { + // break; + // } + // first_match_addr += peerMemoryAtomSize; + // } + + // found = false; + // Addr second_match_addr = first_match_addr + peerMemoryAtomSize; + // while(true) { + // for (auto range: vertex_ranges) { + // if (range.contains(second_match_addr)) { + // found = true; + // break; + // } + // } + // if (found) { + // break; + // } + // second_match_addr += peerMemoryAtomSize; + // } + + // nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); + // memoryAddressOffset = first_match_addr; +// } void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) @@ -117,7 +121,10 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - return ((int) (addr / peerMemoryAtomSize)) % numLines; + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + DPRINTF(CoalesceEngine, "%s: Trimming addr: %lu to %lu.\n", + __func__, addr, trimmed_addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } // addr should be aligned to peerMemoryAtomSize @@ -125,10 +132,10 @@ int CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - int bit_index = atom_index * block_bits; - return bit_index; + return atom_index * block_bits; } // index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) @@ -136,9 +143,8 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr block_addr = (nmpu * peerMemoryAtomSize) * - 
((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); - return (block_addr + memoryAddressOffset); + Addr trimmed_addr = index * sizeof(WorkListItem); + return peerMemoryRange.addIntlvBits(trimmed_addr); } bool @@ -149,7 +155,8 @@ CoalesceEngine::recvWLRead(Addr addr) __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; assert(aligned_addr % peerMemoryAtomSize == 0); - int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); @@ -507,7 +514,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } Addr addr = pkt->getAddr(); - int block_index = (addr / peerMemoryAtomSize) % numLines; + // int block_index = (addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(addr); DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); @@ -591,7 +599,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index a322379b05..28b204e198 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -91,8 +91,8 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; - int nmpu; - Addr memoryAddressOffset; + // int nmpu; + // Addr memoryAddressOffset; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -174,7 +174,7 @@ class CoalesceEngine : public BaseMemoryEngine void recvPushRetry(); - virtual void startup() override; + // virtual void startup() override; }; } From 6c9e7c8d4c68d72742a39a50918f4df35eaa663c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 20:51:48 -0700 Subject: [PATCH 132/247] Refactoring PushEngine to inherit from BaseMemoryEngine. 
--- src/accl/graph/sega/CoalesceEngine.py | 6 +- src/accl/graph/sega/PushEngine.py | 15 ++- src/accl/graph/sega/WLEngine.py | 11 +- src/accl/graph/sega/base_memory_engine.hh | 20 ++- src/accl/graph/sega/coalesce_engine.hh | 14 --- src/accl/graph/sega/push_engine.cc | 143 +++++++++++----------- src/accl/graph/sega/push_engine.hh | 17 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 8 files changed, 117 insertions(+), 111 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 536c3477ae..06c6f92750 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -36,7 +36,7 @@ class CoalesceEngine(BaseMemoryEngine): peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") - cache_size = Param.MemorySize("16KiB", "Size of the internal SRAM array.") + cache_size = Param.MemorySize("Size of the internal SRAM array.") - num_mshr_entry = Param.Int(4, "Number of MSHR entries.") - num_tgts_per_mshr = Param.Int(20, "Number of Targets Per MSHR.") + num_mshr_entry = Param.Int("Number of MSHR entries.") + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index d3276799aa..447731219e 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,13 +27,20 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseMemEngine import BaseMemEngine +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class PushEngine(BaseMemEngine): +class PushEngine(BaseMemoryEngine): type = 'PushEngine' cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr("") - push_req_queue_size = Param.Int(0, "") + base_edge_addr = Param.Addr("The base address for the " + "attached edge memory") + push_req_queue_size = Param.Int("Size of the queue to " + "queue 
push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index cab47fbe7b..98089328f4 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -35,6 +35,11 @@ class WLEngine(BaseReduceEngine): cxx_class = 'gem5::WLEngine' resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoalesceEngine(NULL, "") - update_queue_size = Param.Int(0, "") - on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary + coalesce_engine = Param.CoalesceEngine(NULL, "The CoalesceEngine " + "this WLEngine is connected to.") + update_queue_size = Param.Int("Size of the queue WLEngine stores " + "the incoming updates") + register_file_size = Param.Int("Number of internal registers the " + "WLEngine has. 
It can service as " + "many updates as this queueu has " + "entries at the same time.") # 4 is arbitrary diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index efbfa5312d..5653ede698 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -43,7 +43,21 @@ namespace gem5 class BaseMemoryEngine : public ClockedObject { - private: + protected: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _pending(false) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + }; + class MemPort : public RequestPort { private: @@ -65,13 +79,11 @@ class BaseMemoryEngine : public ClockedObject virtual void recvReqRetry(); }; - protected: System* system; const RequestorID _requestorId; - AddrRange peerMemoryRange; MemPort memPort; - + AddrRange peerMemoryRange; size_t peerMemoryAtomSize; virtual void recvMemRetry() = 0; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 28b204e198..b8cac15f5c 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,20 +47,6 @@ class WLEngine; class CoalesceEngine : public BaseMemoryEngine { private: - class MemoryEvent : public EventFunctionWrapper - { - private: - bool _pending; - public: - MemoryEvent(const std::function &callback, - const std::string &name): - EventFunctionWrapper(callback, name), _pending(false) - {} - bool pending() { return _pending; } - void sleep() { _pending = true; } - void wake() { _pending = false; } - }; - struct Block { WorkListItem* items; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cfebf8e5df..d87462d7dd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ 
b/src/accl/graph/sega/push_engine.cc @@ -35,13 +35,15 @@ namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms): - BaseMemEngine(params), +PushEngine::PushEngine(const Params ¶ms): + BaseMemoryEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), numTotalRetries(0), numPendingRetries(0), - nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), + onTheFlyMemReqs(0), + memRespQueueSize(params.resp_queue_size), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) @@ -52,10 +54,8 @@ PushEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "req_port") { return reqPort; - } else if (if_name == "mem_port") { - return BaseMemEngine::getPort(if_name, idx); } else { - return SimObject::getPort(if_name, idx); + return BaseMemoryEngine::getPort(if_name, idx); } } @@ -98,9 +98,9 @@ PushEngine::ReqPort::recvReqRetry() if (!_blocked) { blockedPacket = nullptr; DPRINTF(PushEngine, "%s: Sent the blockedPacket. " - "_blocked: %s, (blockedPacket == nullptr): %s.\n", - __func__, _blocked ? "true" : "false", - (blockedPacket == nullptr) ? "true" : "false"); + "_blocked: %s, (blockedPacket == nullptr): %s.\n", + __func__, _blocked ? "true" : "false", + (blockedPacket == nullptr) ? 
"true" : "false"); } } @@ -149,14 +149,9 @@ PushEngine::recvWLItem(WorkListItem wl) DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); - if ((!nextAddrGenEvent.scheduled())) { - if (memQueueFull()) { - if (!pendingMemRetry()) { - requestMemRetry(1); - } - } else { - schedule(nextAddrGenEvent, nextCycle()); - } + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -178,67 +173,68 @@ PushEngine::recvWLItemRetry(WorkListItem wl) __func__, pushReqQueue.size()); numTotalRetries--; - if ((!nextAddrGenEvent.scheduled())) { - if (memQueueFull()) { - if (!pendingMemRetry()) { - requestMemRetry(1); - } - } else { - schedule(nextAddrGenEvent, nextCycle()); - } + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } } void -PushEngine::processNextAddrGenEvent() +PushEngine::processNextMemoryReadEvent() { - Addr aligned_addr, offset; - int num_edges; - - PushPacketInfoGen &curr_info = pushReqQueue.front(); - std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - DPRINTF(PushEngine, "%s: Current packet information generated by " - "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " - "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); - - PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - reqOffsetMap[pkt->req] = offset; - reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = curr_info.value(); - - enqueueMemReq(pkt); - - if (curr_info.done()) { - DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); - pushReqQueue.pop_front(); - DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" - "pushReqQueue.size() = %u.\n", - __func__, pushReqQueue.size()); - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - " free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } - } - } + if (memPort.blocked()) { + nextMemoryReadEvent.sleep(); + return; } - if (memQueueFull()) { - if (!pushReqQueue.empty()) { - requestMemRetry(1); + if (memRespQueue.size() < (memRespQueueSize - onTheFlyMemReqs)) { + Addr aligned_addr, offset; + int num_edges; + + PushPacketInfoGen &curr_info = pushReqQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + DPRINTF(PushEngine, "%s: Current packet information generated by " + "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); + reqOffsetMap[pkt->req] = offset; + reqNumEdgeMap[pkt->req] = num_edges; + reqValueMap[pkt->req] = curr_info.value(); + + memPort.sendPacket(pkt); + onTheFlyMemReqs++; + + if (curr_info.done()) { + DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); + pushReqQueue.pop_front(); + DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" + "pushReqQueue.size() = %u.\n", + __func__, pushReqQueue.size()); + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" + " free spaces.\n", __func__, free_space); + if ((free_space >= numElementsPerLine) && + (numPendingRetries == 0)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } + } + } } - return; } - if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { - schedule(nextAddrGenEvent, nextCycle()); + // if ((!nextMemoryReadEvent.scheduled()) && (!pushReqQueue.empty())) { + // schedule(nextMemoryReadEvent, nextCycle()); + // } + if (!pushReqQueue.empty()) { + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -253,9 +249,11 @@ PushEngine::processNextSendRetryEvent() void PushEngine::recvMemRetry() { - assert(!nextAddrGenEvent.scheduled()); - DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); - schedule(nextAddrGenEvent, nextCycle()); + if (nextMemoryReadEvent.pending()) { + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); + nextMemoryReadEvent.wake(); + schedule(nextMemoryReadEvent, nextCycle()); + } } bool @@ -264,7 +262,8 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. 
assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); - assert(memRespQueue.size() <= respQueueSize); + onTheFlyMemReqs--; + assert(memRespQueue.size() <= memRespQueueSize); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 378cd1a487..9b182e2251 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,7 +29,7 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_mem_engine.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -39,7 +39,7 @@ namespace gem5 class CoalesceEngine; -class PushEngine : public BaseMemEngine +class PushEngine : public BaseMemoryEngine { private: class PushPacketInfoGen { @@ -115,15 +115,14 @@ class PushEngine : public BaseMemEngine std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - // Since the push engine can process incoming packets faster than - // memory can send those packets, the size of this queue will - // always be limited by the b/w of the memory. 
+ int onTheFlyMemReqs; + int memRespQueueSize; std::deque memRespQueue; template PacketPtr createUpdatePacket(Addr addr, T value); - EventFunctionWrapper nextAddrGenEvent; - void processNextAddrGenEvent(); + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); EventFunctionWrapper nextPushEvent; void processNextPushEvent(); @@ -145,13 +144,12 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: - virtual int respBuffSize() { return memRespQueue.size(); } virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(PushEngine); - PushEngine(const PushEngineParams ¶ms); + PushEngine(const Params ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -169,7 +167,6 @@ class PushEngine : public BaseMemEngine int getNumRetries() { return numTotalRetries; } - void recvRetryReject() { numPendingRetries--; } }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 2d4ffc9cac..12f4548aa2 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -40,7 +40,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".resp_port", this), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), - registerFileSize(params.on_the_fly_update_map_size), + registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) From b7e76bfdb113a55311db67e0532495e958b4794b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 26 Jul 2022 09:01:42 -0700 Subject: [PATCH 133/247] Refactored PushEngine to inherit from BaseMemoryEngine. 
--- src/accl/graph/SConscript | 4 +- src/accl/graph/base/BaseMemEngine.py | 47 --- src/accl/graph/base/SConscript | 3 - src/accl/graph/base/base_mem_engine.cc | 225 -------------- src/accl/graph/base/base_mem_engine.hh | 125 -------- src/accl/graph/sega/base_memory_engine.cc | 4 + src/accl/graph/sega/base_memory_engine.hh | 7 +- src/accl/graph/sega/coalesce_engine.cc | 362 +++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 9 +- 9 files changed, 275 insertions(+), 511 deletions(-) delete mode 100644 src/accl/graph/base/BaseMemEngine.py delete mode 100644 src/accl/graph/base/base_mem_engine.cc delete mode 100644 src/accl/graph/base/base_mem_engine.hh diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 53c6411de6..5dffd1a396 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,5 +28,5 @@ Import('*') DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', - 'BaseMemEngine', 'BaseMemoryEngine']) +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', + 'WLEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/base/BaseMemEngine.py b/src/accl/graph/base/BaseMemEngine.py deleted file mode 100644 index 2ecb6659d8..0000000000 --- a/src/accl/graph/base/BaseMemEngine.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject - -class BaseMemEngine(ClockedObject): - abstract = True - type = 'BaseMemEngine' - cxx_header = "accl/graph/base/base_mem_engine.hh" - cxx_class = 'gem5::BaseMemEngine' - - system = Param.System(Parent.any, 'System this Engine is a part of') - mem_port = RequestPort("Port to communicate with the memory") - - outstanding_mem_req_queue_size = Param.Int(16, "Capacity of queue in " - "which memory requests are queued.") - - attached_memory_atom_size = Param.Int(64, "The atom size of the attached " - "memory.") - - resp_queue_size = Param.Int(64, "blah") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 45877a12ca..0e43d1aed8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,6 @@ Import('*') -SimObject('BaseMemEngine.py') SimObject('BaseReduceEngine.py') -Source('base_mem_engine.cc') Source('base_reduce_engine.cc') -DebugFlag('BaseMemEngine') diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc deleted file mode 100644 index 590307b2bc..0000000000 --- a/src/accl/graph/base/base_mem_engine.cc +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_mem_engine.hh" - -#include "debug/BaseMemEngine.hh" -#include "debug/SEGAStructureSize.hh" - -namespace gem5 -{ - -BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): - ClockedObject(params), - system(params.system), - memPort(name() + ".mem_port", this), - memQueueSize(params.outstanding_mem_req_queue_size), - onTheFlyReqs(0), - memRetryRequested(false), - memSpaceRequested(0), - nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), - respQueueSize(params.resp_queue_size), - _requestorId(system->getRequestorId(this)), - peerMemoryAtomSize(params.attached_memory_atom_size) -{} - -BaseMemEngine::~BaseMemEngine() -{} - -Port& -BaseMemEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "mem_port") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -BaseMemEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - return owner->recvTimingResp(pkt); -} - -void -BaseMemEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } - - owner->wakeUp(); -} - -void -BaseMemEngine::processNextMemReqEvent() -{ - if ((respQueueSize == 0) || - ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { - PacketPtr pkt = memQueue.front(); - memPort.sendPacket(pkt); - onTheFlyReqs++; - DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following info. 
" - "pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - memQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped pkt: %s from " - "memQueue. memQueue.size = %d, memQueueSize = %d.\n", - __func__, pkt->print(), memQueue.size(), memQueueSize); - DPRINTF(BaseMemEngine, "%s: Popped pkt: %s from " - "memQueue. memQueue.size = %d, memQueueSize = %d.\n", - __func__, pkt->print(), memQueue.size(), memQueueSize); - if (memRetryRequested && - (memQueue.size() <= - (memQueueSize - memSpaceRequested))) { - memRetryRequested = false; - memSpaceRequested = 0; - recvMemRetry(); - } - } - - if ((!memPort.blocked()) && - (!memQueue.empty()) && (!nextMemReqEvent.scheduled())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -PacketPtr -BaseMemEngine::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; -} - -PacketPtr -BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -bool -BaseMemEngine::allocateMemQueueSpace(int space) -{ - assert((memQueueSize == 0) || - (memQueue.size() <= memQueueSize)); - return ( - (memQueueSize == 0) || - (memQueue.size() <= (memQueueSize - space)) - ); -} - -bool -BaseMemEngine::memQueueFull() -{ - assert((memQueueSize == 0) || - (memQueue.size() <= memQueueSize)); - return ( - (memQueueSize != 0) && - (memQueue.size() == memQueueSize)); -} - -void 
-BaseMemEngine::enqueueMemReq(PacketPtr pkt) -{ - panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); - memQueue.push_back(pkt); - DPRINTF(SEGAStructureSize, "%s: Pushed pkt: %s to memQueue. " - "memQueue.size = %d, memQueueSize = %d.\n", __func__, - pkt->print(), memQueue.size(), memQueueSize); - DPRINTF(BaseMemEngine, "%s: Pushed pkt: %s to memQueue. " - "memQueue.size = %d, memQueueSize = %d.\n", __func__, - pkt->print(), memQueue.size(), memQueueSize); - if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -void -BaseMemEngine::requestMemRetry(int space) { - panic_if((memRetryRequested == true) || (memSpaceRequested != 0), - "You should not request another alarm without the first one being" - "responded to.\n"); - DPRINTF(BaseMemEngine, "%s: Alarm requested with space = %d.\n", __func__, space); - memRetryRequested = true; - memSpaceRequested = space; -} - -void -BaseMemEngine::wakeUp() -{ - assert(!nextMemReqEvent.scheduled()); - if (!memQueue.empty()) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -bool -BaseMemEngine::recvTimingResp(PacketPtr pkt) -{ - onTheFlyReqs--; - return handleMemResp(pkt); -} - -} diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh deleted file mode 100644 index 01c862d555..0000000000 --- a/src/accl/graph/base/base_mem_engine.hh +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ - -#include - -#include "base/addr_range.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/BaseMemEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" - -namespace gem5 -{ - -class BaseMemEngine : public ClockedObject -{ - private: - class MemPort : public RequestPort - { - private: - BaseMemEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseMemEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - System* system; - MemPort memPort; - - int memQueueSize; - int onTheFlyReqs; - bool memRetryRequested; - int memSpaceRequested; - std::deque memQueue; - - EventFunctionWrapper nextMemReqEvent; - void processNextMemReqEvent(); - - protected: - - int respQueueSize; - const RequestorID _requestorId; - - size_t peerMemoryAtomSize; - - bool allocateMemQueueSpace(int space); - bool memQueueFull(); - - bool pendingMemRetry() { return memRetryRequested; } - void requestMemRetry(int space); - - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - void enqueueMemReq(PacketPtr pkt); - - virtual int respBuffSize() = 0; - virtual void recvMemRetry() = 0; - virtual bool handleMemResp(PacketPtr pkt) = 0; - - PacketPtr createReadPacket(Addr addr, unsigned int size); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); - - public: - PARAMS(BaseMemEngine); - - BaseMemEngine(const BaseMemEngineParams ¶ms); - ~BaseMemEngine(); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - RequestorID requestorId() { return _requestorId; } - - AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); 
} - - bool recvTimingResp(PacketPtr pkt); - void recvFunctional(PacketPtr pkt); - - void wakeUp(); - -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index 9db95d6bd6..c60d189e0f 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -77,7 +77,11 @@ BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { blockedPacket = pkt; _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked. blockedPacket %s.\n", + __func__, blockedPacket->print()); } else { + DPRINTF(BaseMemoryEngine, "%s: Packet %s sent successfully.\n", + __func__, pkt->print()); owner->recvMemRetry(); } } diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index 5653ede698..f336edcbf1 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -48,14 +48,19 @@ class BaseMemoryEngine : public ClockedObject { private: bool _pending; + int _prevState; + public: MemoryEvent(const std::function &callback, const std::string &name): - EventFunctionWrapper(callback, name), _pending(false) + EventFunctionWrapper(callback, name), + _pending(false), _prevState(0) {} bool pending() { return _pending; } void sleep() { _pending = true; } void wake() { _pending = false; } + void setPrevState(int state) { _prevState = state; } + int getPrevState() { return _prevState; } }; class MemPort : public RequestPort diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 21f048213a..daaed28f1c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/coalesce_engine.hh" +#include + #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" @@ -53,7 +55,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): 
nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), + nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -317,6 +319,10 @@ CoalesceEngine::processNextMemoryReadEvent() pendingEventQueue.push_back("nextMemoryReadEvent"); // Maximum three MemoryEvents. assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " + "has been pushed to pendingEventQueue. " + "pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); return; } @@ -366,11 +372,14 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::recvMemRetry() { + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); if (pendingEventQueue.empty()) { + DPRINTF(CoalesceEngine, "%s: No events pending.\n", __func__); return; } std::string front = pendingEventQueue.front(); + DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); if (front == "nextMemoryReadEvent") { assert(!nextMemoryReadEvent.scheduled()); @@ -382,11 +391,11 @@ CoalesceEngine::recvMemRetry() assert(nextWriteBackEvent.pending()); schedule(nextWriteBackEvent, nextCycle()); nextWriteBackEvent.wake(); - } else if (front == "nextSendRetryEvent") { - assert(!nextSendRetryEvent.scheduled()); - assert(nextSendRetryEvent.pending()); - schedule(nextSendRetryEvent, nextCycle()); - nextSendRetryEvent.wake(); + } else if (front == "nextRecvPushRetryEvent") { + assert(!nextRecvPushRetryEvent.scheduled()); + assert(nextRecvPushRetryEvent.pending()); + schedule(nextRecvPushRetryEvent, nextCycle()); + nextRecvPushRetryEvent.wake(); } else { panic("EVENT IS NOT RECOGNIZED.\n"); } @@ -642,14 +651,16 @@ CoalesceEngine::processNextApplyEvent() int 
block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask != 0) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid apply process. " - "Therefore, ignoring the apply schedule.\n", + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " + "apply process. Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. Therefore, no apply " - "needed.\n", __func__, block_index); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. " + "Therefore, no apply needed.\n", __func__, block_index); } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] could be applied.\n", + __func__, block_index); for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; uint32_t new_prop = std::min( @@ -683,8 +694,9 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].hasConflict){ writeBackQueue.push_back(block_index); assert(writeBackQueue.size() <= numLines); - DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. writeBackQueue.size = %u.\n", - __func__, block_index, writeBackQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " + "writeBackQueue.size = %u.\n", __func__, + block_index, writeBackQueue.size()); } applyQueue.pop_front(); @@ -710,6 +722,10 @@ CoalesceEngine::processNextWriteBackEvent() pendingEventQueue.push_back("nextWriteBackEvent"); // Maximum three MemoryEvent. assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " + "has been pushed to pendingEventQueue. 
" + "pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); return; } @@ -774,121 +790,259 @@ void CoalesceEngine::recvPushRetry() { numRetriesReceived++; + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); // For now since we do only one retry at a time, we should not receive // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(!nextSendRetryEvent.pending()); - assert(!nextSendRetryEvent.scheduled()); + assert(!nextRecvPushRetryEvent.pending()); + assert(!nextRecvPushRetryEvent.scheduled()); assert(numRetriesReceived == 1); - schedule(nextSendRetryEvent, nextCycle()); + schedule(nextRecvPushRetryEvent, nextCycle()); } -void -CoalesceEngine::processNextSendRetryEvent() +// void +// CoalesceEngine::processNextRecvPushRetryEvent() +// { +// assert(!nextRecvPushRetryEvent.pending()); +// assert(needsPush.count() != 0); + +// Addr block_addr = 0; +// int block_index = 0; +// int it = 0; +// uint32_t slice = 0; +// bool hit_in_cache = false; + +// for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; +// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { +// for (int i = 0; i < numElementsPerLine; i++) { +// slice <<= 1; +// slice |= needsPush[it + i]; +// } +// if (slice) { +// block_addr = getBlockAddrFromBitIndex(it); +// block_index = getBlockIndex(block_addr); +// if ((cacheBlocks[block_index].addr == block_addr) && +// (cacheBlocks[block_index].valid)) { +// if (cacheBlocks[block_index].busyMask == 0) { +// hit_in_cache = true; +// break; +// } +// } else { +// hit_in_cache = false; +// break; +// } +// } +// } + +// assert(it < MAX_BITVECTOR_SIZE); +// if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { +// currentBitSliceIndex = 0; +// } else { +// currentBitSliceIndex = it + numElementsPerLine; +// } + +// DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " +// "in needsPush.\n", __func__, slice, it); + +// if (hit_in_cache) { +// int push_needed = 0; +// 
DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// assert(peerPushEngine->getNumRetries() == needsPush.count()); +// for (int i = 0; i < numElementsPerLine; i++) { +// // TODO: Make this more programmable +// uint32_t new_prop = std::min( +// cacheBlocks[block_index].items[i].prop, +// cacheBlocks[block_index].items[i].tempProp); +// cacheBlocks[block_index].items[i].tempProp = new_prop; +// cacheBlocks[block_index].items[i].prop = new_prop; +// if (needsPush[it + i] == 1) { +// peerPushEngine->recvWLItemRetry( +// cacheBlocks[block_index].items[i]); +// } +// push_needed += needsPush[it + i]; +// needsPush[it + i] = 0; +// } +// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); +// assert(peerPushEngine->getNumRetries() == needsPush.count()); +// if (applyQueue.find(block_index)) { +// applyQueue.erase(block_index); +// if (applyQueue.empty() && nextApplyEvent.scheduled()) { +// deschedule(nextApplyEvent); +// } +// if (cacheBlocks[block_index].hasConflict) { +// writeBackQueue.push_back(block_index); +// assert(writeBackQueue.size() <= numLines); +// if ((!writeBackQueue.empty()) && +// (!nextWriteBackEvent.pending()) && +// (!nextWriteBackEvent.scheduled())) { +// schedule(nextWriteBackEvent, nextCycle()); +// } +// } +// } +// } else { +// if (memPort.blocked()) { +// nextRecvPushRetryEvent.sleep(); +// pendingEventQueue.push_back("nextRecvPushRetryEvent"); +// // Maximum three MemoryEvent. +// assert(pendingEventQueue.size() <= 3); +// return; +// } + +// // FIXME: Fix the retry mechanism between memory and cache to +// // handle memory retries correctly. This probably requires scheduling +// // an event for sending the retry. For now we're enabling infinite +// // queueing in the memQueue. +// // FIXME: Also do not send requests for cache lines that are already +// // read but await data. 
Just set a flag or sth. +// PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); +// SenderState* sender_state = new SenderState(true); +// pkt->pushSenderState(sender_state); +// memPort.sendPacket(pkt); +// } + +// numRetriesReceived--; +// assert(numRetriesReceived == 0); +// assert(!nextRecvPushRetryEvent.scheduled()); +// } + +std::tuple +CoalesceEngine::getOptimalBitVectorSlice() { - assert(!nextSendRetryEvent.pending()); - assert(needsPush.count() != 0); + bool hit_in_cache; + int slice_base = -1; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - Addr block_addr = 0; - int block_index = 0; - int it = 0; - uint32_t slice = 0; - bool hit_in_cache = false; - - for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; - it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { + int score = 0; + uint32_t current_popcount = 0; + for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + int current_score = 0; for (int i = 0; i < numElementsPerLine; i++) { - slice <<= 1; - slice |= needsPush[it + i]; + current_popcount += needsPush[it + i]; } - if (slice) { - block_addr = getBlockAddrFromBitIndex(it); - block_index = getBlockIndex(block_addr); - if ((cacheBlocks[block_index].addr == block_addr) && - (cacheBlocks[block_index].valid)) { - if (cacheBlocks[block_index].busyMask == 0) { - hit_in_cache = true; - break; - } - } else { + if (current_popcount == 0) { + continue; + } + current_score += current_popcount; + Addr addr = getBlockAddrFromBitIndex(it); + int block_index = getBlockIndex(addr); + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].busyMask == 0)) { + current_score += numElementsPerLine * 2; + if (current_score > score) { + score = current_score; + slice_base = it; + hit_in_cache = true; + } + } else if (!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].allocated))) { + score += numElementsPerLine; + if (current_score > 
score) { + score = current_score; + slice_base = it; hit_in_cache = false; - break; } } } - assert(it < MAX_BITVECTOR_SIZE); - if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { - currentBitSliceIndex = 0; - } else { - currentBitSliceIndex = it + numElementsPerLine; - } + return std::make_tuple(hit_in_cache, slice_base); +} + +void +CoalesceEngine::processNextRecvPushRetryEvent() +{ + bool hit_in_cache; + int slice_base; + std::tie(hit_in_cache, slice_base)= getOptimalBitVectorSlice(); - DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " - "in needsPush.\n", __func__, slice, it); + if (slice_base != -1) { + Addr addr = getBlockAddrFromBitIndex(slice_base); + int block_index = getBlockIndex(addr); + if (hit_in_cache) { + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + + // if nextRecvPushRetryEvent has been blocked by memory before + if (nextRecvPushRetryEvent.getPrevState() == -1) { + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " + "its MemRetry.\n", __func__); + recvMemRetry(); + nextRecvPushRetryEvent.setPrevState(0); + } - if (hit_in_cache) { - int push_needed = 0; - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - // TODO: Make this more programmable - uint32_t new_prop = std::min( + int push_needed = 0; + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + + for (int i = 0; i < numElementsPerLine; i++) { + // TODO: Make this more programmable + uint32_t new_prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry( - 
cacheBlocks[block_index].items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; - } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + if (needsPush[slice_base + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[slice_base + i]; + needsPush[slice_base + i] = 0; } - if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!writeBackQueue.empty()) && - (!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } + if (cacheBlocks[block_index].hasConflict) { + writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); + if ((!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); + } } } - } - } else { - if (memPort.blocked()) { - nextSendRetryEvent.sleep(); - pendingEventQueue.push_back("nextSendRetryEvent"); - // Maximum three MemoryEvent. 
- assert(pendingEventQueue.size() <= 3); - return; - } + } else { + if (memPort.blocked()) { + assert(nextRecvPushRetryEvent.getPrevState() != -1); + nextRecvPushRetryEvent.setPrevState(-1); + nextRecvPushRetryEvent.sleep(); + pendingEventQueue.push_back("nextRecvPushRetryEvent"); + assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " + "and has been pushed to pendingEventQueue." + " pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); + return; + } + // if nextRecvPushRetryEvent has been blocked by memory before + if (nextRecvPushRetryEvent.getPrevState() == -1) { + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " + "unblocked by memPort. Setting prevState to 0.\n", __func__); + nextRecvPushRetryEvent.setPrevState(0); + } - // FIXME: Fix the retry mechanism between memory and cache to - // handle memory retries correctly. This probably requires scheduling - // an event for sending the retry. For now we're enabling infinite - // queueing in the memQueue. - // FIXME: Also do not send requests for cache lines that are already - // read but await data. Just set a flag or sth. - PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + // TODO: Set a tracking structure so that nextMemoryReadEvent knows + // It does not have to read this address anymore. It can simply set + // a flag to true (maybe not even needed just look if the cache has a + // line allocated for it in the cacheBlocks). 
+ } + numRetriesReceived--; + assert(numRetriesReceived == 0); + } + if (numRetriesReceived > 0) { + schedule(nextRecvPushRetryEvent, nextCycle()); } - - numRetriesReceived--; - assert(numRetriesReceived == 0); - assert(!nextSendRetryEvent.scheduled()); } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b8cac15f5c..356fee0107 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,6 +106,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + std::tuple getOptimalBitVectorSlice(); std::deque pendingEventQueue; @@ -121,8 +122,8 @@ class CoalesceEngine : public BaseMemoryEngine MemoryEvent nextWriteBackEvent; void processNextWriteBackEvent(); - MemoryEvent nextSendRetryEvent; - void processNextSendRetryEvent(); + MemoryEvent nextRecvPushRetryEvent; + void processNextRecvPushRetryEvent(); struct CoalesceStats : public statistics::Group { @@ -145,8 +146,8 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceStats stats; protected: - virtual void recvMemRetry(); - virtual bool handleMemResp(PacketPtr pkt); + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; public: PARAMS(CoalesceEngine); From 0fc5c5efb512183db2b35cc30217555073973296 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 26 Jul 2022 09:49:11 -0700 Subject: [PATCH 134/247] Making bit vector smaller and choosing slices faster. 
--- src/accl/graph/sega/coalesce_engine.cc | 7 ++++++- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index daaed28f1c..f86d6877ad 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -914,9 +914,10 @@ CoalesceEngine::getOptimalBitVectorSlice() int slice_base = -1; int score = 0; - uint32_t current_popcount = 0; + int max_score_possible = 3 * numElementsPerLine; for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { int current_score = 0; + uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; } @@ -934,6 +935,9 @@ CoalesceEngine::getOptimalBitVectorSlice() score = current_score; slice_base = it; hit_in_cache = true; + if (score == max_score_possible) { + break; + } } } else if (!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].allocated))) { @@ -942,6 +946,7 @@ CoalesceEngine::getOptimalBitVectorSlice() score = current_score; slice_base = it; hit_in_cache = false; + assert(score < max_score_possible); } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 356fee0107..f6ed4843fa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -37,7 +37,7 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" -#define MAX_BITVECTOR_SIZE (1 << 30) +#define MAX_BITVECTOR_SIZE (1 << 28) namespace gem5 { From ef61dcfccf1e22ea364b6ce13437c9ea9676fceb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 28 Jul 2022 06:36:15 -0700 Subject: [PATCH 135/247] Merging all memory interactions into one event. 
--- src/accl/graph/sega/coalesce_engine.cc | 559 +++++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 24 +- 2 files changed, 255 insertions(+), 328 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f86d6877ad..4d7107274b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,17 +45,15 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - currentBitSliceIndex(0), - numRetriesReceived(0), - applyQueue(numLines), - writeBackQueue(numLines), - nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + numRetriesReceived(0), applyQueue(numLines), + // writeBackQueue(numLines), + nextMemoryEvent([this] { processNextMemoryEvent(); }, name()), + // nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), - nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), + // nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), + // nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -69,49 +67,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): needsPush.reset(); } -// void -// CoalesceEngine::startup() -// { -// return; - // std::cout << "Hello" << std::endl; - // DPRINTF(CoalesceEngine, "%s: Range attached to this engine is %s.\n", - // __func__, 
peerMemoryRange.to_string()); - // AddrRangeList vertex_ranges = getAddrRanges(); - - // bool found = false; - // Addr first_match_addr = 0; - // while(true) { - // for (auto range: vertex_ranges) { - // if (range.contains(first_match_addr)) { - // found = true; - // break; - // } - // } - // if (found) { - // break; - // } - // first_match_addr += peerMemoryAtomSize; - // } - - // found = false; - // Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - // while(true) { - // for (auto range: vertex_ranges) { - // if (range.contains(second_match_addr)) { - // found = true; - // break; - // } - // } - // if (found) { - // break; - // } - // second_match_addr += peerMemoryAtomSize; - // } - - // nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); - // memoryAddressOffset = first_match_addr; -// } - void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) { @@ -260,15 +215,20 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - fillQueue.push_back(block_index); - assert(fillQueue.size() <= numLines); + // fillQueue.push_back(block_index); + // assert(fillQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); // FIXME: Fix this DPRINTF // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", // __func__, fillQueue.size()); - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())) { - schedule(nextMemoryReadEvent, nextCycle()); + // if ((!nextMemoryReadEvent.pending()) && + // (!nextMemoryReadEvent.scheduled())) { + // schedule(nextMemoryReadEvent, nextCycle()); + // } + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } stats.readMisses++; stats.numVertexReads++; @@ -309,24 +269,24 @@ CoalesceEngine::recvWLRead(Addr addr) } void -CoalesceEngine::processNextMemoryReadEvent() +CoalesceEngine::processNextMemoryReadEvent(int block_index) { - assert(!nextMemoryReadEvent.pending()); - if (memPort.blocked()) { - // TODO: Implement interface where events of the CoalesceEngine are - // pushed to a fifo to be scheduled later. - nextMemoryReadEvent.sleep(); - pendingEventQueue.push_back("nextMemoryReadEvent"); - // Maximum three MemoryEvents. - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " - "has been pushed to pendingEventQueue. " - "pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // assert(!nextMemoryReadEvent.pending()); + // if (memPort.blocked()) { + // // TODO: Implement interface where events of the CoalesceEngine are + // // pushed to a fifo to be scheduled later. + // nextMemoryReadEvent.sleep(); + // pendingEventQueue.push_back("nextMemoryReadEvent"); + // // Maximum three MemoryEvents. + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " + // "has been pushed to pendingEventQueue. 
" + // "pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } - int block_index = fillQueue.front(); + // int block_index = fillQueue.front(); PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " @@ -334,13 +294,11 @@ CoalesceEngine::processNextMemoryReadEvent() memPort.sendPacket(pkt); - fillQueue.pop_front(); + // fillQueue.pop_front(); - if (!fillQueue.empty()) { - assert(!nextMemoryReadEvent.scheduled()); - assert(!nextMemoryReadEvent.pending()); - schedule(nextMemoryReadEvent, nextCycle()); - } + // if (!fillQueue.empty()) { + // memoryFunctionQueue.push_back([this] { processNextMemoryReadEvent(); }); + // } } // TODO: For loop to empty the entire responseQueue. @@ -370,38 +328,70 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::recvMemRetry() +CoalesceEngine::processNextMemoryEvent() { - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - if (pendingEventQueue.empty()) { - DPRINTF(CoalesceEngine, "%s: No events pending.\n", __func__); + if (memPort.blocked()) { + nextMemoryEvent.sleep(); return; } - std::string front = pendingEventQueue.front(); - DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); - - if (front == "nextMemoryReadEvent") { - assert(!nextMemoryReadEvent.scheduled()); - assert(nextMemoryReadEvent.pending()); - schedule(nextMemoryReadEvent, nextCycle()); - nextMemoryReadEvent.wake(); - } else if (front == "nextWriteBackEvent") { - assert(!nextWriteBackEvent.scheduled()); - assert(nextWriteBackEvent.pending()); - schedule(nextWriteBackEvent, nextCycle()); - nextWriteBackEvent.wake(); - } else if (front == "nextRecvPushRetryEvent") { - assert(!nextRecvPushRetryEvent.scheduled()); - assert(nextRecvPushRetryEvent.pending()); - schedule(nextRecvPushRetryEvent, nextCycle()); - nextRecvPushRetryEvent.wake(); - } else { - panic("EVENT IS NOT RECOGNIZED.\n"); + 
std::function next_memory_function; + int next_memory_function_input; + std::tie(next_memory_function, next_memory_function_input) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input); + memoryFunctionQueue.pop_front(); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); } +} - pendingEventQueue.pop_front(); - return; +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + // if (pendingEventQueue.empty()) { + // DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + // return; + // } + + // std::string front = pendingEventQueue.front(); + // DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); + + // if (front == "nextMemoryReadEvent") { + // assert(!nextMemoryReadEvent.scheduled()); + // assert(nextMemoryReadEvent.pending()); + // schedule(nextMemoryReadEvent, nextCycle()); + // nextMemoryReadEvent.wake(); + // } else if (front == "nextWriteBackEvent") { + // assert(!nextWriteBackEvent.scheduled()); + // assert(nextWriteBackEvent.pending()); + // schedule(nextWriteBackEvent, nextCycle()); + // nextWriteBackEvent.wake(); + // } else if (front == "nextRecvPushRetryEvent") { + // assert(!nextRecvPushRetryEvent.scheduled()); + // assert(nextRecvPushRetryEvent.pending()); + // schedule(nextRecvPushRetryEvent, nextCycle()); + // nextRecvPushRetryEvent.wake(); + // } else { + // panic("EVENT IS NOT RECOGNIZED.\n"); + // } + + // pendingEventQueue.pop_front(); + // return; + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, 
nextCycle()); } // FIXME: Fix this function. @@ -464,12 +454,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // if ((!nextWriteBackEvent.pending()) && + // (!nextWriteBackEvent.scheduled())) { + // schedule(nextWriteBackEvent, nextCycle()); + // } + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } } else { @@ -528,9 +523,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); - assert((cacheBlocks[block_index].allocated) && // allocated cache block - (!cacheBlocks[block_index].valid) && // valid is false - (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR + // assert((cacheBlocks[block_index].allocated) && // allocated cache block + // (!cacheBlocks[block_index].valid) && // valid is false + // (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR + assert(cacheBlocks[block_index].allocated); + assert(!cacheBlocks[block_index].valid); + assert(MSHR.find(block_index) != MSHR.end()); pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -691,22 +689,21 @@ CoalesceEngine::processNextApplyEvent() } // TODO: This is where eviction policy goes - if (cacheBlocks[block_index].hasConflict){ - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. 
" - "writeBackQueue.size = %u.\n", __func__, - block_index, writeBackQueue.size()); + if ((cacheBlocks[block_index].hasConflict) && + (cacheBlocks[block_index].busyMask == 0)) { + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " + // "writeBackQueue.size = %u.\n", __func__, + // block_index, writeBackQueue.size()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } applyQueue.pop_front(); - - if ((!writeBackQueue.empty()) && - (!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } - if ((!applyQueue.empty()) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); @@ -714,22 +711,22 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextWriteBackEvent() +CoalesceEngine::processNextWriteBackEvent(int block_index) { - assert(!nextWriteBackEvent.pending()); - if (memPort.blocked()) { - nextWriteBackEvent.sleep(); - pendingEventQueue.push_back("nextWriteBackEvent"); - // Maximum three MemoryEvent. - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " - "has been pushed to pendingEventQueue. " - "pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // assert(!nextWriteBackEvent.pending()); + // if (memPort.blocked()) { + // nextWriteBackEvent.sleep(); + // pendingEventQueue.push_back("nextWriteBackEvent"); + // // Maximum three MemoryEvent. + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " + // "has been pushed to pendingEventQueue. 
" + // "pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } - int block_index = writeBackQueue.front(); + // int block_index = writeBackQueue.front(); // Why would we write it back if it does not have a conflict? assert(cacheBlocks[block_index].hasConflict); @@ -769,21 +766,35 @@ CoalesceEngine::processNextWriteBackEvent() cacheBlocks[block_index].dirty = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - fillQueue.push_back(block_index); - assert(fillQueue.size() <= numLines); - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())){ - schedule(nextMemoryReadEvent, nextCycle()); - } + // fillQueue.push_back(block_index); + // assert(fillQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); + // if ((!nextMemoryReadEvent.pending()) && + // (!nextMemoryReadEvent.scheduled())){ + // schedule(nextMemoryReadEvent, nextCycle()); + // } + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } - writeBackQueue.pop_front(); - - if (!writeBackQueue.empty()) { - assert(!nextWriteBackEvent.pending()); - assert(!nextWriteBackEvent.scheduled()); - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.pop_front(); + // assert(writeBackQueue.size() <= numLines); + // DPRINTF(CoalesceEngine, "%s: Popped %d from writeBackQueue. 
" + // "writeBackQueue.size = %d, writeBackQueueSize = %d.\n", + // __func__, block_index, writeBackQueue.size(), numLines); + + // if (!writeBackQueue.empty()) { + // assert(!nextWriteBackEvent.pending()); + // assert(!nextWriteBackEvent.scheduled()); + // schedule(nextWriteBackEvent, nextCycle()); + // memoryFunctionQueue.push_back([this] { processNextWriteBackEvent(); }); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // } } void @@ -793,130 +804,28 @@ CoalesceEngine::recvPushRetry() DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); // For now since we do only one retry at a time, we should not receive // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(!nextRecvPushRetryEvent.pending()); - assert(!nextRecvPushRetryEvent.scheduled()); + // assert(!nextRecvPushRetryEvent.pending()); + // assert(!nextRecvPushRetryEvent.scheduled()); assert(numRetriesReceived == 1); - schedule(nextRecvPushRetryEvent, nextCycle()); + // schedule(nextRecvPushRetryEvent, nextCycle()); + // TODO: Pass slice_base to getOptimalBitVectorSlice + memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } -// void -// CoalesceEngine::processNextRecvPushRetryEvent() -// { -// assert(!nextRecvPushRetryEvent.pending()); -// assert(needsPush.count() != 0); - -// Addr block_addr = 0; -// int block_index = 0; -// int it = 0; -// uint32_t slice = 0; -// bool hit_in_cache = false; - -// for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; -// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { -// for (int i = 0; i < numElementsPerLine; i++) { -// slice <<= 1; -// slice |= needsPush[it + i]; -// } -// if (slice) { -// block_addr = getBlockAddrFromBitIndex(it); -// block_index = 
getBlockIndex(block_addr); -// if ((cacheBlocks[block_index].addr == block_addr) && -// (cacheBlocks[block_index].valid)) { -// if (cacheBlocks[block_index].busyMask == 0) { -// hit_in_cache = true; -// break; -// } -// } else { -// hit_in_cache = false; -// break; -// } -// } -// } - -// assert(it < MAX_BITVECTOR_SIZE); -// if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { -// currentBitSliceIndex = 0; -// } else { -// currentBitSliceIndex = it + numElementsPerLine; -// } - -// DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " -// "in needsPush.\n", __func__, slice, it); - -// if (hit_in_cache) { -// int push_needed = 0; -// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// assert(peerPushEngine->getNumRetries() == needsPush.count()); -// for (int i = 0; i < numElementsPerLine; i++) { -// // TODO: Make this more programmable -// uint32_t new_prop = std::min( -// cacheBlocks[block_index].items[i].prop, -// cacheBlocks[block_index].items[i].tempProp); -// cacheBlocks[block_index].items[i].tempProp = new_prop; -// cacheBlocks[block_index].items[i].prop = new_prop; -// if (needsPush[it + i] == 1) { -// peerPushEngine->recvWLItemRetry( -// cacheBlocks[block_index].items[i]); -// } -// push_needed += needsPush[it + i]; -// needsPush[it + i] = 0; -// } -// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); -// assert(peerPushEngine->getNumRetries() == needsPush.count()); -// if (applyQueue.find(block_index)) { -// applyQueue.erase(block_index); -// if (applyQueue.empty() && nextApplyEvent.scheduled()) { -// deschedule(nextApplyEvent); -// } -// if (cacheBlocks[block_index].hasConflict) { -// writeBackQueue.push_back(block_index); -// assert(writeBackQueue.size() <= numLines); -// if ((!writeBackQueue.empty()) && -// (!nextWriteBackEvent.pending()) && -// (!nextWriteBackEvent.scheduled())) { -// 
schedule(nextWriteBackEvent, nextCycle()); -// } -// } -// } -// } else { -// if (memPort.blocked()) { -// nextRecvPushRetryEvent.sleep(); -// pendingEventQueue.push_back("nextRecvPushRetryEvent"); -// // Maximum three MemoryEvent. -// assert(pendingEventQueue.size() <= 3); -// return; -// } - -// // FIXME: Fix the retry mechanism between memory and cache to -// // handle memory retries correctly. This probably requires scheduling -// // an event for sending the retry. For now we're enabling infinite -// // queueing in the memQueue. -// // FIXME: Also do not send requests for cache lines that are already -// // read but await data. Just set a flag or sth. -// PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); -// SenderState* sender_state = new SenderState(true); -// pkt->pushSenderState(sender_state); -// memPort.sendPacket(pkt); -// } - -// numRetriesReceived--; -// assert(numRetriesReceived == 0); -// assert(!nextRecvPushRetryEvent.scheduled()); -// } - std::tuple CoalesceEngine::getOptimalBitVectorSlice() { - bool hit_in_cache; + bool hit_in_cache = false; int slice_base = -1; - int score = 0; - int max_score_possible = 3 * numElementsPerLine; + // int score = 0; + // int max_score_possible = 3 * numElementsPerLine; for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - int current_score = 0; + // int current_score = 0; uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; @@ -924,30 +833,32 @@ CoalesceEngine::getOptimalBitVectorSlice() if (current_popcount == 0) { continue; } - current_score += current_popcount; + // current_score += current_popcount; Addr addr = getBlockAddrFromBitIndex(it); int block_index = getBlockIndex(addr); if ((cacheBlocks[block_index].valid) && (cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].busyMask == 0)) { - current_score += numElementsPerLine * 2; - if (current_score > score) { - score = current_score; - slice_base = 
it; - hit_in_cache = true; - if (score == max_score_possible) { - break; - } - } + // current_score += numElementsPerLine * 2; + // if (current_score > score) { + // score = current_score; + // slice_base = it; + // hit_in_cache = true; + // if (score == max_score_possible) { + // break; + // } + // } + return std::make_tuple(true, it); } else if (!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].allocated))) { - score += numElementsPerLine; - if (current_score > score) { - score = current_score; - slice_base = it; - hit_in_cache = false; - assert(score < max_score_possible); - } + // score += numElementsPerLine; + // if (current_score > score) { + // score = current_score; + // slice_base = it; + // hit_in_cache = false; + // assert(score < max_score_possible); + // } + return std::make_tuple(false, it); } } @@ -955,11 +866,11 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextRecvPushRetryEvent() +CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) { bool hit_in_cache; int slice_base; - std::tie(hit_in_cache, slice_base)= getOptimalBitVectorSlice(); + std::tie(hit_in_cache, slice_base) = getOptimalBitVectorSlice(); if (slice_base != -1) { Addr addr = getBlockAddrFromBitIndex(slice_base); @@ -969,12 +880,12 @@ CoalesceEngine::processNextRecvPushRetryEvent() assert(cacheBlocks[block_index].busyMask == 0); // if nextRecvPushRetryEvent has been blocked by memory before - if (nextRecvPushRetryEvent.getPrevState() == -1) { - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " - "its MemRetry.\n", __func__); - recvMemRetry(); - nextRecvPushRetryEvent.setPrevState(0); - } + // if (nextRecvPushRetryEvent.getPrevState() == -1) { + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " + // "its MemRetry.\n", __func__); + // recvMemRetry(); + // nextRecvPushRetryEvent.setPrevState(0); + // } int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", @@ -1005,33 +916,38 @@ 
CoalesceEngine::processNextRecvPushRetryEvent() deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + // if ((!nextWriteBackEvent.pending()) && + // (!nextWriteBackEvent.scheduled())) { + // schedule(nextWriteBackEvent, nextCycle()); + // } + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } } else { - if (memPort.blocked()) { - assert(nextRecvPushRetryEvent.getPrevState() != -1); - nextRecvPushRetryEvent.setPrevState(-1); - nextRecvPushRetryEvent.sleep(); - pendingEventQueue.push_back("nextRecvPushRetryEvent"); - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " - "and has been pushed to pendingEventQueue." - " pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // if (memPort.blocked()) { + // // assert(nextRecvPushRetryEvent.getPrevState() != -1); + // nextRecvPushRetryEvent.setPrevState(-1); + // nextRecvPushRetryEvent.sleep(); + // pendingEventQueue.push_back("nextRecvPushRetryEvent"); + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " + // "and has been pushed to pendingEventQueue." + // " pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } // if nextRecvPushRetryEvent has been blocked by memory before - if (nextRecvPushRetryEvent.getPrevState() == -1) { - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " - "unblocked by memPort. 
Setting prevState to 0.\n", __func__); - nextRecvPushRetryEvent.setPrevState(0); - } + // if (nextRecvPushRetryEvent.getPrevState() == -1) { + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " + // "unblocked by memPort. Setting prevState to 0.\n", __func__); + // nextRecvPushRetryEvent.setPrevState(0); + // } PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -1045,8 +961,15 @@ CoalesceEngine::processNextRecvPushRetryEvent() numRetriesReceived--; assert(numRetriesReceived == 0); } + // if (numRetriesReceived > 0) { + // schedule(nextRecvPushRetryEvent, nextCycle()); + // } if (numRetriesReceived > 0) { - schedule(nextRecvPushRetryEvent, nextCycle()); + memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f6ed4843fa..4036dc49af 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -92,26 +92,30 @@ class CoalesceEngine : public BaseMemoryEngine int numTgtsPerMSHR; std::unordered_map> MSHR; - std::deque fillQueue; + // std::deque fillQueue; std::deque> responseQueue; - int currentBitSliceIndex; int numRetriesReceived; InOutSet applyQueue; std::bitset needsPush; - InOutSet writeBackQueue; + // InOutSet writeBackQueue; + int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); - std::deque pendingEventQueue; + // std::deque pendingEventQueue; + + std::deque, int>> memoryFunctionQueue; + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); - MemoryEvent nextMemoryReadEvent; - void processNextMemoryReadEvent(); + // MemoryEvent nextMemoryReadEvent; + void 
processNextMemoryReadEvent(int block_index); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -119,11 +123,11 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - MemoryEvent nextWriteBackEvent; - void processNextWriteBackEvent(); + // MemoryEvent nextWriteBackEvent; + void processNextWriteBackEvent(int block_index); - MemoryEvent nextRecvPushRetryEvent; - void processNextRecvPushRetryEvent(); + // MemoryEvent nextRecvPushRetryEvent; + void processNextRecvPushRetryEvent(int slice_base); struct CoalesceStats : public statistics::Group { From d00c61008d8ee2157b711441cd71a34ab32bb108 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Jul 2022 10:59:33 -0700 Subject: [PATCH 136/247] Adding more dprintfs. --- src/accl/graph/base/data_structs.hh | 36 +- src/accl/graph/sega/base_memory_engine.cc | 8 +- src/accl/graph/sega/coalesce_engine.cc | 676 ++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 36 +- 4 files changed, 275 insertions(+), 481 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f178d5a7e2..707b57c56f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,9 +32,7 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include -#include -#include +#include namespace gem5 { @@ -90,49 +88,51 @@ static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); template -class InOutSet +class UniqueFIFO { private: - std::unordered_set set; + std::list fifo; public: - InOutSet(int cap) - { - set.reserve(cap); - } + UniqueFIFO() {} void push_back(T item) { - if (set.find(item) == set.end()) { - set.insert(item); + if (!find(item)) { + fifo.push_back(item); } } void pop_front() { - assert(set.begin() != set.end()); - set.erase(set.begin()); + assert(!fifo.empty()); + fifo.pop_front(); } T front() { - return *(set.begin()); + return 
fifo.front(); } size_t size() { - return set.size(); + return fifo.size(); } bool empty() { - return (size() == 0); + return fifo.empty(); } bool find(T item) { - return (set.find(item) != set.end()); + // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); + auto it = std::find(fifo.begin(), fifo.end(), item); + return (it != fifo.end()); } void erase(T item) { - set.erase(item); + // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); + auto it = std::find(fifo.begin(), fifo.end(), item); + assert(it != fifo.end()); + fifo.erase(it); } }; diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index c60d189e0f..a5d1d7e8e7 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -73,15 +73,15 @@ void BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; - DPRINTF(BaseMemoryEngine, "%s: MemPort blocked. 
blockedPacket %s.\n", - __func__, blockedPacket->print()); + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); } else { - DPRINTF(BaseMemoryEngine, "%s: Packet %s sent successfully.\n", - __func__, pkt->print()); + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); owner->recvMemRetry(); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4d7107274b..6ed94fe938 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,14 +46,16 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - numRetriesReceived(0), applyQueue(numLines), - // writeBackQueue(numLines), - nextMemoryEvent([this] { processNextMemoryEvent(); }, name()), - // nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), - nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyEvent([this] { processNextApplyEvent(); }, name()), - // nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - // nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), + numRetriesReceived(0), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -79,8 +81,6 @@ CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - DPRINTF(CoalesceEngine, "%s: Trimming addr: %lu to %lu.\n", - __func__, addr, trimmed_addr); return ((int) (trimmed_addr / 
peerMemoryAtomSize)) % numLines; } @@ -108,21 +108,25 @@ bool CoalesceEngine::recvWLRead(Addr addr) { assert(MSHR.size() <= numMSHREntries); - DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", - __func__, addr); - Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); assert(aligned_addr % peerMemoryAtomSize == 0); - // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].allocated); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); // Hit // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextRespondEvent for latency cycles in + // Can't just schedule the nextResponseEvent for latency cycles in // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); @@ -138,12 +142,12 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); - // TODO: Add a stat to count the number of WLItems that have been touched. + // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; - if (!nextRespondEvent.scheduled()) { - schedule(nextRespondEvent, nextCycle()); + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; return true; @@ -151,44 +155,50 @@ CoalesceEngine::recvWLRead(Addr addr) // miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu not " - "found in MSHRs.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" + " %lu not found in MSHRs.\n", __func__, block_index, addr); assert(MSHR.size() <= numMSHREntries); if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " - "Rejecting request.\n", __func__); + "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection stats.readRejections++; return false; } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); + DPRINTF(CoalesceEngine, "%s: MSHR " + "entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHR[block_index].size() <= numTgtsPerMSHR); DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " - "Rejecting request.\n", + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", __func__, block_index); stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; - if ((cacheBlocks[block_index].busyMask == 0) && (cacheBlocks[block_index].valid)) { - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " - "applyQueue.size = %u.\n", __func__, - block_index, applyQueue.size()); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " + "busy. It %s in the applyQueue.\n", + __func__, block_index, + applyQueue.find(block_index) ? "is" : "is not"); + if (!applyQueue.find(block_index)) { + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added %d to " + "applyQueue. 
applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); + } assert(!applyQueue.empty()); if ((!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); @@ -208,24 +218,18 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - - // fillQueue.push_back(block_index); - // assert(fillQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); - // FIXME: Fix this DPRINTF - // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " - // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", - // __func__, fillQueue.size()); - // if ((!nextMemoryReadEvent.pending()) && - // (!nextMemoryReadEvent.scheduled())) { - // schedule(nextMemoryReadEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " + "input %d to memoryFunctionQueue.\n", + __func__, block_index); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); @@ -236,21 +240,23 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu already " - "in MSHRs.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " + "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " - "Rejecting request.\n", - __func__, block_index); + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); stats.readRejections++; return false; } - if ((!cacheBlocks[block_index].hasConflict) && - (aligned_addr != cacheBlocks[block_index].addr)) { + if ((aligned_addr != cacheBlocks[block_index].addr)) { DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; + } else { + DPRINTF(CoalesceEngine, "%s: There is room for another target " + "for cacheBlocks[%d].\n", __func__, block_index); } if (aligned_addr != cacheBlocks[block_index].addr) { @@ -260,295 +266,88 @@ CoalesceEngine::recvWLRead(Addr addr) } MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " + "cacheBlocks[%d].\n", __func__, addr, block_index); stats.numVertexReads++; return true; } } } -void -CoalesceEngine::processNextMemoryReadEvent(int block_index) -{ - // assert(!nextMemoryReadEvent.pending()); - // if (memPort.blocked()) { - // // TODO: Implement interface where events of the CoalesceEngine are - // // pushed to a fifo to be scheduled later. - // nextMemoryReadEvent.sleep(); - // pendingEventQueue.push_back("nextMemoryReadEvent"); - // // Maximum three MemoryEvents. - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " - // "has been pushed to pendingEventQueue. " - // "pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - - // int block_index = fillQueue.front(); - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - - memPort.sendPacket(pkt); - - // fillQueue.pop_front(); - - // if (!fillQueue.empty()) { - // memoryFunctionQueue.push_back([this] { processNextMemoryReadEvent(); }); - // } -} - -// TODO: For loop to empty the entire responseQueue. -void -CoalesceEngine::processNextRespondEvent() -{ - Addr addr_response; - WorkListItem worklist_response; - - std::tie(addr_response, worklist_response) = responseQueue.front(); - peerWLEngine->handleIncomingWL(addr_response, worklist_response); - DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); - - if ((!nextRespondEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); - } -} - -void -CoalesceEngine::processNextMemoryEvent() -{ - if (memPort.blocked()) { - nextMemoryEvent.sleep(); - return; - } - - std::function next_memory_function; - int next_memory_function_input; - std::tie(next_memory_function, next_memory_function_input) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input); - memoryFunctionQueue.pop_front(); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); - - assert(!nextMemoryEvent.pending()); - assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { - schedule(nextMemoryEvent, nextCycle()); - } -} - -void -CoalesceEngine::recvMemRetry() -{ - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - // if (pendingEventQueue.empty()) { - // DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - // return; - // } - - // std::string front = pendingEventQueue.front(); - // DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); - - // if (front == "nextMemoryReadEvent") { - // assert(!nextMemoryReadEvent.scheduled()); - // assert(nextMemoryReadEvent.pending()); - // schedule(nextMemoryReadEvent, nextCycle()); - // nextMemoryReadEvent.wake(); - // } else if (front == "nextWriteBackEvent") { - // assert(!nextWriteBackEvent.scheduled()); - // assert(nextWriteBackEvent.pending()); - // schedule(nextWriteBackEvent, nextCycle()); - // nextWriteBackEvent.wake(); - // } else if (front == "nextRecvPushRetryEvent") { - // assert(!nextRecvPushRetryEvent.scheduled()); - // assert(nextRecvPushRetryEvent.pending()); - // schedule(nextRecvPushRetryEvent, nextCycle()); - // nextRecvPushRetryEvent.wake(); - // } else { - // panic("EVENT IS NOT RECOGNIZED.\n"); - // } - - // pendingEventQueue.pop_front(); - // return; - - if (!nextMemoryEvent.pending()) { - DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - return; - } - assert(!nextMemoryEvent.scheduled()); - nextMemoryEvent.wake(); - schedule(nextMemoryEvent, nextCycle()); -} - -// FIXME: Fix this function. 
bool CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); delete pkt; - DPRINTF(CoalesceEngine, "%s: Received a write response for Addr: %lu. Dropping " - "the packet.\n", __func__, pkt->getAddr()); return true; } + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + if (pkt->findNextSenderState()) { - Addr addr = pkt->getAddr(); + assert(!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid))); + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. int it = getBitIndexBase(addr); - int block_index = getBlockIndex(addr); - - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid)) { - // We read the address to send the wl but it is put in cache before - // the read response arrives. - if (cacheBlocks[block_index].busyMask == 0) { - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was found in the cache as idle.\n", - __func__, addr); - int push_needed = 0; - // It is not busy anymore, we have to send the wl from cache. 
- DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (cacheBlocks[block_index].items[i].degree == 0))); - // TODO: Make this more programmable - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; - } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - // Since we have just applied the line, we can take it out of - // the applyQueue if it's in there. No need to do the same - // thing for evictQueue. - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); - } - if (cacheBlocks[block_index].hasConflict) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // if ((!nextWriteBackEvent.pending()) && - // (!nextWriteBackEvent.scheduled())) { - // schedule(nextWriteBackEvent, nextCycle()); - // } - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - } - } - } else { - // The line is busy. 
Therefore, we have to disregard the data - // we received from the memory and also tell the push engine to - // deallocate the space it allocated for this retry. However, - // we still have to rememeber that these items need a retry. - // i.e. don't change needsPush, call recvWLItemRetry with - // do_push = false - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was found in the cache as busy.\n", - __func__, addr); - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - } - } else { - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was not found in the cache.\n", - __func__, addr); - WorkListItem* items = pkt->getPtr(); - int push_needed = 0; - // No applying of the line needed. - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (items[i].degree == 0))); - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry(items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was not found in the cache.\n", + __func__, addr); + WorkListItem* items = pkt->getPtr(); + int push_needed = 0; + // No applying of the line needed. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (items[i].degree == 0))); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry(items[i]); } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); + push_needed += needsPush[it + i]; + needsPush[it + i] = 0; } - + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + // } delete pkt; return true; } - Addr addr = pkt->getAddr(); - // int block_index = (addr / peerMemoryAtomSize) % numLines; - int block_index = getBlockIndex(addr); - - DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", - __func__, pkt->getAddr()); - // assert((cacheBlocks[block_index].allocated) && // allocated cache block - // (!cacheBlocks[block_index].valid) && // valid is false - // (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR - assert(cacheBlocks[block_index].allocated); - assert(!cacheBlocks[block_index].valid); - assert(MSHR.find(block_index) != MSHR.end()); - pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].allocated); + assert(!cacheBlocks[block_index].valid); + assert(MSHR.find(block_index) != MSHR.end()); + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, - block_index, i, 
cacheBlocks[block_index].items[i].to_string()); + for (int i = 0; i < numElementsPerLine; i++) { + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, + cacheBlocks[block_index].items[i].to_string()); + } + cacheBlocks[block_index].valid = true; + delete pkt; } - cacheBlocks[block_index].valid = true; - delete pkt; // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; for (int i = 0; i < MSHR[block_index].size(); i++) { Addr miss_addr = MSHR[block_index][i]; - Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cacheBlocks[%d] could " - "be serviced with the received packet.\n", - __func__, miss_addr, block_index); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); @@ -567,10 +366,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block - servicedIndices.push_back(i); - DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " - "removal.\n", __func__, i, block_index); + // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " + // "removal.\n", __func__, i, block_index); } } @@ -593,19 +391,46 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(cacheBlocks[block_index].hasConflict); } - if ((!nextRespondEvent.scheduled()) && + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); + schedule(nextResponseEvent, nextCycle()); } return true; } +// TODO: For loop to empty the entire responseQueue. +void +CoalesceEngine::processNextResponseEvent() +{ + Addr addr_response; + WorkListItem worklist_response; + + std::tie(addr_response, worklist_response) = responseQueue.front(); + peerWLEngine->handleIncomingWL(addr_response, worklist_response); + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
- Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); @@ -691,12 +516,11 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if ((cacheBlocks[block_index].hasConflict) && (cacheBlocks[block_index].busyMask == 0)) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " - // "writeBackQueue.size = %u.\n", __func__, - // block_index, writeBackQueue.size()); + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input %d " + "to memoryFunctionQueue.\n", __func__, block_index); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); @@ -711,23 +535,47 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextWriteBackEvent(int block_index) +CoalesceEngine::processNextMemoryEvent() { - // assert(!nextWriteBackEvent.pending()); - // if (memPort.blocked()) { - // nextWriteBackEvent.sleep(); - // pendingEventQueue.push_back("nextWriteBackEvent"); - // // Maximum three MemoryEvent. - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " - // "has been pushed to pendingEventQueue. 
" - // "pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - - // int block_index = writeBackQueue.front(); + if (memPort.blocked()) { + nextMemoryEvent.sleep(); + return; + } + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + std::tie( + next_memory_function, + next_memory_function_input) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input); + memoryFunctionQueue.pop_front(); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index) +{ + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + + memPort.sendPacket(pkt); +} + +void +CoalesceEngine::processNextWriteBack(int block_index) +{ // Why would we write it back if it does not have a conflict? assert(cacheBlocks[block_index].hasConflict); @@ -749,6 +597,10 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) "Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); memPort.sendPacket(write_pkt); + } else { + DPRINTF(CoalesceEngine, "%s: No change observed on " + "cacheBlocks[%d]. 
No write back needed.\n", + __func__, block_index); } assert(!MSHR[block_index].empty()); Addr miss_addr = MSHR[block_index].front(); @@ -756,7 +608,7 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) "cacheBlocks[%d] is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + roundDown(miss_addr, peerMemoryAtomSize); cacheBlocks[block_index].addr = aligned_miss_addr; cacheBlocks[block_index].busyMask = 0; @@ -766,53 +618,12 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) cacheBlocks[block_index].dirty = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - // fillQueue.push_back(block_index); - // assert(fillQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); - // if ((!nextMemoryReadEvent.pending()) && - // (!nextMemoryReadEvent.scheduled())){ - // schedule(nextMemoryReadEvent, nextCycle()); - // } - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - } - - // writeBackQueue.pop_front(); - // assert(writeBackQueue.size() <= numLines); - // DPRINTF(CoalesceEngine, "%s: Popped %d from writeBackQueue. 
" - // "writeBackQueue.size = %d, writeBackQueueSize = %d.\n", - // __func__, block_index, writeBackQueue.size(), numLines); - - // if (!writeBackQueue.empty()) { - // assert(!nextWriteBackEvent.pending()); - // assert(!nextWriteBackEvent.scheduled()); - // schedule(nextWriteBackEvent, nextCycle()); - // memoryFunctionQueue.push_back([this] { processNextWriteBackEvent(); }); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // } -} -void -CoalesceEngine::recvPushRetry() -{ - numRetriesReceived++; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - // For now since we do only one retry at a time, we should not receive - // a retry while this nextSendingRetryEvent is scheduled or is pending. - // assert(!nextRecvPushRetryEvent.pending()); - // assert(!nextRecvPushRetryEvent.scheduled()); - assert(numRetriesReceived == 1); - // schedule(nextRecvPushRetryEvent, nextCycle()); - // TODO: Pass slice_base to getOptimalBitVectorSlice - memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input %d to " + "memoryFunctionQueue.\n", __func__, block_index); } } @@ -866,7 +677,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) +CoalesceEngine::processNextPushRetry(int slice_base_2) { bool hit_in_cache; int slice_base; @@ -879,14 +690,6 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - // if nextRecvPushRetryEvent has been blocked by memory before - // if 
(nextRecvPushRetryEvent.getPrevState() == -1) { - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " - // "its MemRetry.\n", __func__); - // recvMemRetry(); - // nextRecvPushRetryEvent.setPrevState(0); - // } - int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); @@ -916,39 +719,15 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - // if ((!nextWriteBackEvent.pending()) && - // (!nextWriteBackEvent.scheduled())) { - // schedule(nextWriteBackEvent, nextCycle()); - // } - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed nextWriteBackEvent for" + " input %d to memoryFunctionQueue.\n", + __func__, block_index); } } } else { - // if (memPort.blocked()) { - // // assert(nextRecvPushRetryEvent.getPrevState() != -1); - // nextRecvPushRetryEvent.setPrevState(-1); - // nextRecvPushRetryEvent.sleep(); - // pendingEventQueue.push_back("nextRecvPushRetryEvent"); - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " - // "and has been pushed to pendingEventQueue." - // " pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - // if nextRecvPushRetryEvent has been blocked by memory before - // if (nextRecvPushRetryEvent.getPrevState() == -1) { - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " - // "unblocked by memPort. 
Setting prevState to 0.\n", __func__); - // nextRecvPushRetryEvent.setPrevState(0); - // } - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); @@ -961,18 +740,53 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) numRetriesReceived--; assert(numRetriesReceived == 0); } - // if (numRetriesReceived > 0) { - // schedule(nextRecvPushRetryEvent, nextCycle()); - // } + if (numRetriesReceived > 0) { - memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back([this] (int slice_base) { + processNextPushRetry(slice_base); + }, 0); + DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " + "0 to memoryFunctionQueue.\n", __func__); + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +void +CoalesceEngine::recvPushRetry() +{ + numRetriesReceived++; + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); + // For now since we do only one retry at a time, we should not receive + // a retry while this nextSendingRetryEvent is scheduled or is pending. 
+ assert(numRetriesReceived == 1); + + // TODO: Pass slice_base to getOptimalBitVectorSlice + memoryFunctionQueue.emplace_back([this] (int slice_base) { + processNextPushRetry(slice_base); + }, 0); + DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " + "memoryFunctionQueue.\n", __func__); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } } + + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4036dc49af..7db09cec11 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -77,58 +77,40 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; - // int nmpu; - // Addr memoryAddressOffset; - WLEngine* peerWLEngine; PushEngine* peerPushEngine; - Block* cacheBlocks; - int numLines; int numElementsPerLine; + Block* cacheBlocks; int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; - - // std::deque fillQueue; - std::deque> responseQueue; int numRetriesReceived; - InOutSet applyQueue; + UniqueFIFO applyQueue; std::bitset needsPush; - // InOutSet writeBackQueue; - - int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); - // std::deque pendingEventQueue; - - std::deque, int>> memoryFunctionQueue; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); + void processNextRead(int block_index); + void processNextWriteBack(int block_index); + void processNextPushRetry(int slice_base); + std::deque, int>> memoryFunctionQueue; - // MemoryEvent nextMemoryReadEvent; - void processNextMemoryReadEvent(int block_index); - - EventFunctionWrapper nextRespondEvent; - void processNextRespondEvent(); + EventFunctionWrapper 
nextResponseEvent; + void processNextResponseEvent(); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - // MemoryEvent nextWriteBackEvent; - void processNextWriteBackEvent(int block_index); - - // MemoryEvent nextRecvPushRetryEvent; - void processNextRecvPushRetryEvent(int slice_base); - struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); @@ -164,8 +146,6 @@ class CoalesceEngine : public BaseMemoryEngine void registerWLEngine(WLEngine* wl_engine); void recvPushRetry(); - - // virtual void startup() override; }; } From 08ca0a193d0d22ef85cf5a95691a0317ff14c276 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Jul 2022 16:59:30 -0700 Subject: [PATCH 137/247] Fixing cache block state machine. wip. --- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/coalesce_engine.cc | 385 ++++++++++++++++++++++--- src/accl/graph/sega/coalesce_engine.hh | 31 +- src/accl/graph/sega/state_machine.md | 1 + 4 files changed, 368 insertions(+), 50 deletions(-) create mode 100644 src/accl/graph/sega/state_machine.md diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 97a62d44a0..81a29df6af 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -43,5 +43,6 @@ DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') DebugFlag('CoalesceEngine') +DebugFlag('CacheBlockState') DebugFlag('PushEngine') DebugFlag('WLEngine') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6ed94fe938..a0c85de2f5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,6 +34,7 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" +#include "debug/CacheBlockState.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" @@ -104,11 +105,180 @@ CoalesceEngine::getBlockAddrFromBitIndex(int index) return 
peerMemoryRange.addIntlvBits(trimmed_addr); } +// TODO: Prev implementaton of recvWLRead. Remove +// bool +// CoalesceEngine::recvWLRead(Addr addr) +// { +// assert(MSHR.size() <= numMSHREntries); + +// Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); +// assert(aligned_addr % peerMemoryAtomSize == 0); +// int block_index = getBlockIndex(aligned_addr); +// assert(block_index < numLines); +// int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); +// assert(wl_offset < numElementsPerLine); +// DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " +// "This request maps to cacheBlocks[%d], aligned_addr: " +// "%lu, and wl_offset: %d.\n", __func__, addr, +// block_index, aligned_addr, wl_offset); + +// if ((cacheBlocks[block_index].addr == aligned_addr) && +// (cacheBlocks[block_index].valid)) { +// assert(cacheBlocks[block_index].allocated); +// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); +// // Hit +// // TODO: Add a hit latency as a param for this object. +// // Can't just schedule the nextResponseEvent for latency cycles in +// // the future. +// responseQueue.push_back(std::make_tuple(addr, +// cacheBlocks[block_index].items[wl_offset])); +// DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " +// "to responseQueue. responseQueue.size = %d, " +// "responseQueueSize = %d.\n", __func__, addr, +// cacheBlocks[block_index].items[wl_offset].to_string(), +// responseQueue.size(), +// peerWLEngine->getRegisterFileSize()); +// DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " +// "to responseQueue. responseQueue.size = %d, " +// "responseQueueSize = %d.\n", __func__, addr, +// cacheBlocks[block_index].items[wl_offset].to_string(), +// responseQueue.size(), +// peerWLEngine->getRegisterFileSize()); +// // TODO: Stat to count the number of WLItems that have been touched. 
+// cacheBlocks[block_index].busyMask |= (1 << wl_offset); +// stats.readHits++; + +// if (!nextResponseEvent.scheduled()) { +// schedule(nextResponseEvent, nextCycle()); +// } +// stats.numVertexReads++; +// return true; +// } else { +// // miss +// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); +// if (MSHR.find(block_index) == MSHR.end()) { +// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" +// " %lu not found in MSHRs.\n", __func__, block_index, addr); +// assert(MSHR.size() <= numMSHREntries); +// if (MSHR.size() == numMSHREntries) { +// // Out of MSHR entries +// DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " +// "Rejecting request.\n", __func__); +// // TODO: Break out read rejections into more than one stat +// // based on the cause of the rejection +// stats.readRejections++; +// return false; +// } else { +// DPRINTF(CoalesceEngine, "%s: MSHR " +// "entries available.\n", __func__); +// if (cacheBlocks[block_index].allocated) { +// assert(MSHR[block_index].size() <= numTgtsPerMSHR); +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " +// "with Addr: %lu.\n", __func__, addr, +// cacheBlocks[block_index].addr); +// if (MSHR[block_index].size() == numTgtsPerMSHR) { +// DPRINTF(CoalesceEngine, "%s: Out of targets for " +// "cacheBlocks[%d]. Rejecting request.\n", +// __func__, block_index); +// stats.readRejections++; +// return false; +// } +// cacheBlocks[block_index].hasConflict = true; +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " +// "for cacheBlocks[%d].\n", __func__, addr, block_index); +// stats.readMisses++; +// stats.numVertexReads++; +// if ((cacheBlocks[block_index].busyMask == 0) && +// (cacheBlocks[block_index].valid)) { +// DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " +// "busy. It %s in the applyQueue.\n", +// __func__, block_index, +// applyQueue.find(block_index) ? 
"is" : "is not"); +// if (!applyQueue.find(block_index)) { +// applyQueue.push_back(block_index); +// DPRINTF(CoalesceEngine, "%s: Added %d to " +// "applyQueue. applyQueue.size = %u.\n", +// __func__, block_index, applyQueue.size()); +// } +// assert(!applyQueue.empty()); +// if ((!nextApplyEvent.scheduled())) { +// schedule(nextApplyEvent, nextCycle()); +// } +// } +// return true; +// } else { +// assert(!cacheBlocks[block_index].valid); +// assert(MSHR[block_index].size() == 0); +// // MSHR available and no conflict +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " +// "Allocating a cache line for it.\n" +// , __func__, addr); + +// cacheBlocks[block_index].addr = aligned_addr; +// cacheBlocks[block_index].busyMask = 0; +// cacheBlocks[block_index].allocated = true; +// cacheBlocks[block_index].valid = false; +// cacheBlocks[block_index].hasConflict = false; +// DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" +// " Addr: %lu.\n", __func__, block_index, addr); +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " +// "for cacheBlocks[%d].\n", __func__, addr, block_index); +// memoryFunctionQueue.emplace_back( +// [this] (int block_index) { +// processNextRead(block_index); +// }, block_index); +// DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " +// "input %d to memoryFunctionQueue.\n", +// __func__, block_index); +// if ((!nextMemoryEvent.pending()) && +// (!nextMemoryEvent.scheduled())) { +// schedule(nextMemoryEvent, nextCycle()); +// } +// stats.readMisses++; +// stats.numVertexReads++; +// return true; +// } +// } +// } else { +// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " +// "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); +// if (MSHR[block_index].size() == numTgtsPerMSHR) { +// DPRINTF(CoalesceEngine, "%s: Out of targets for " +// "cacheBlocks[%d]. 
Rejecting request.\n", +// __func__, block_index); +// stats.readRejections++; +// return false; +// } +// if ((aligned_addr != cacheBlocks[block_index].addr)) { +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " +// "with Addr: %lu.\n", __func__, addr, +// cacheBlocks[block_index].addr); +// cacheBlocks[block_index].hasConflict = true; +// } else { +// DPRINTF(CoalesceEngine, "%s: There is room for another target " +// "for cacheBlocks[%d].\n", __func__, block_index); +// } + +// if (aligned_addr != cacheBlocks[block_index].addr) { +// stats.readMisses++; +// } else { +// stats.readHitUnderMisses++; +// } + +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " +// "cacheBlocks[%d].\n", __func__, addr, block_index); +// stats.numVertexReads++; +// return true; +// } +// } +// } + bool CoalesceEngine::recvWLRead(Addr addr) { - assert(MSHR.size() <= numMSHREntries); - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = getBlockIndex(aligned_addr); @@ -119,11 +289,18 @@ CoalesceEngine::recvWLRead(Addr addr) "This request maps to cacheBlocks[%d], aligned_addr: " "%lu, and wl_offset: %d.\n", __func__, addr, block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].allocated); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(!cacheBlocks[block_index].pendingData); + // No cache block could be in pendingApply and pendingWB at the + // same time. + assert(!(cacheBlocks[block_index].pendingApply && + cacheBlocks[block_index].pendingWB)); // Hit // TODO: Add a hit latency as a param for this object. 
// Can't just schedule the nextResponseEvent for latency cycles in @@ -144,20 +321,60 @@ CoalesceEngine::recvWLRead(Addr addr) peerWLEngine->getRegisterFileSize()); // TODO: Stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - stats.readHits++; + // If they are scheduled for apply and WB those schedules should be + // discarded. Since there is no easy way to take items out of the + // function queue. Those functions check for their respective bits + // and skip the process if the respective bit is set to false. + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if (!nextResponseEvent.scheduled()) { schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; return true; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].pendingData)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); + stats.readRejections++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " + "cacheBlocks[%d].\n", __func__, block_index); + } + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + return true; } else { // miss + assert(cacheBlocks[block_index].addr != aligned_addr); + assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" " %lu not found in MSHRs.\n", __func__, block_index, addr); - assert(MSHR.size() <= numMSHREntries); if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " @@ -169,11 +386,12 @@ CoalesceEngine::recvWLRead(Addr addr) } else { DPRINTF(CoalesceEngine, "%s: MSHR " "entries available.\n", __func__); - if (cacheBlocks[block_index].allocated) { - assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if ((cacheBlocks[block_index].valid) || + (cacheBlocks[block_index].pendingData)) { DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); if (MSHR[block_index].size() == numTgtsPerMSHR) { DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. 
Rejecting request.\n", @@ -181,43 +399,116 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readRejections++; return false; } - cacheBlocks[block_index].hasConflict = true; + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state.\n", __func__, block_index); + // We're in idle state + // Idle: valid && !pendingApply && !pendingWB; + // Note 0: needsApply has to be false. Because + // A cache line enters the idle state from two + // other states. First a busy state that does not + // need apply (needsApply is already false) or + // from pendingApplyState after being applied which + // clears the needsApply bit. needsApply is useful + // when a cache block has transitioned from + // pendingApply to busy without the apply happening. + // Note 1: pendingData does not have to be evaluated + // becuase pendingData is cleared when data + // arrives from the memory and valid does not + // denote cleanliness of the line. Rather it + // is used to differentiate between empty blocks + // and the blocks that have data from memory. + // pendingData denotes the transient state between + // getting a miss and getting the data for that miss. + // valid basically means that the data in the cache + // could be used to respond to read/write requests. + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + // There are no conflicts in idle state. 
+ assert(MSHR.find(block_index) == MSHR.end()); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" + "to be written back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextWriteBack for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does" + "not need to be written back.\n", + __func__, block_index); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextRead for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } + } + // cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; + // TODO: Add readConflicts here. 
stats.numVertexReads++; - if ((cacheBlocks[block_index].busyMask == 0) && - (cacheBlocks[block_index].valid)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " - "busy. It %s in the applyQueue.\n", - __func__, block_index, - applyQueue.find(block_index) ? "is" : "is not"); - if (!applyQueue.find(block_index)) { - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to " - "applyQueue. applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); - } - assert(!applyQueue.empty()); - if ((!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); - } - } return true; } else { - assert(!cacheBlocks[block_index].valid); - assert(MSHR[block_index].size() == 0); // MSHR available and no conflict DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " "Allocating a cache line for it.\n" , __func__, addr); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[blokc_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR[block_index].size() == 0); cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + // cacheBlocks[block_index].allocated = true; + // cacheBlocks[block_index].hasConflict = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); MSHR[block_index].push_back(addr); @@ -234,6 +525,9 @@ CoalesceEngine::recvWLRead(Addr addr) 
(!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); stats.readMisses++; stats.numVertexReads++; return true; @@ -241,7 +535,11 @@ CoalesceEngine::recvWLRead(Addr addr) } } else { DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); + "Addr: %lu already in MSHRs. It has a conflict " + "with addr: %lu.\n", __func__, block_index, addr, + cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() > 0); if (MSHR[block_index].size() == numTgtsPerMSHR) { DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. Rejecting request.\n", @@ -249,21 +547,12 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readRejections++; return false; } - if ((aligned_addr != cacheBlocks[block_index].addr)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - cacheBlocks[block_index].hasConflict = true; - } else { - DPRINTF(CoalesceEngine, "%s: There is room for another target " + DPRINTF(CoalesceEngine, "%s: There is room for another target " "for cacheBlocks[%d].\n", __func__, block_index); - } - if (aligned_addr != cacheBlocks[block_index].addr) { - stats.readMisses++; - } else { - stats.readHitUnderMisses++; - } + // cacheBlocks[block_index].hasConflict = true; + // TODO: Might want to differentiate between different misses. 
+ stats.readMisses++; MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " @@ -324,8 +613,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (cacheBlocks[block_index].addr == addr) { - assert(cacheBlocks[block_index].allocated); + DPRINTF(CoalesceEngine, "%s: Received read response to " + "fill cacheBlocks[%d].\n", __func__, block_index); assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); assert(MSHR.find(block_index) != MSHR.end()); pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -335,6 +631,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].pendingData = false; delete pkt; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 7db09cec11..e7655a069e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" +#include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -51,24 +52,42 @@ class CoalesceEngine : public BaseMemoryEngine { WorkListItem* items; Addr addr; - uint8_t busyMask; - bool allocated; + uint64_t busyMask; bool valid; + bool needsApply; + bool needsWB; + bool pendingData; + bool pendingApply; + bool pendingWB; + + bool allocated; bool hasConflict; - bool dirty; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} Block(int num_elements): addr(0), busyMask(0), - allocated(false), 
valid(false), - hasConflict(false), - dirty(false) + needsApply(false), + needsWB(false), + pendingData(false), + pendingApply(false), + pendingWB(false), + allocated(false), + hasConflict(false) { items = new WorkListItem [num_elements]; } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "needsApply: %s, needsWB: %s, pendingData: %s, " + "pendingApply: %s, pendingWB: %s}", addr, busyMask, + valid ? "true" : "false", needsApply ? "true" : "false", + needsWB ? "true" : "false", pendingData ? "true" : "false", + pendingApply ? "true" : "false", pendingWB ? "true" : "false"); + } }; struct SenderState : public Packet::SenderState diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md new file mode 100644 index 0000000000..203c47cf02 --- /dev/null +++ b/src/accl/graph/sega/state_machine.md @@ -0,0 +1 @@ +# CoalesceEngine Block state machine \ No newline at end of file From 2b2b27ce86cd7c6d692af11e3f3f42b712c4d31b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 30 Jul 2022 23:14:08 -0700 Subject: [PATCH 138/247] Fixing cache block state machine. cont. wip --- src/accl/graph/sega/coalesce_engine.cc | 288 +++++++++---------------- 1 file changed, 98 insertions(+), 190 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a0c85de2f5..8f33a2d893 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -105,177 +105,6 @@ CoalesceEngine::getBlockAddrFromBitIndex(int index) return peerMemoryRange.addIntlvBits(trimmed_addr); } -// TODO: Prev implementaton of recvWLRead. 
Remove -// bool -// CoalesceEngine::recvWLRead(Addr addr) -// { -// assert(MSHR.size() <= numMSHREntries); - -// Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); -// assert(aligned_addr % peerMemoryAtomSize == 0); -// int block_index = getBlockIndex(aligned_addr); -// assert(block_index < numLines); -// int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); -// assert(wl_offset < numElementsPerLine); -// DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " -// "This request maps to cacheBlocks[%d], aligned_addr: " -// "%lu, and wl_offset: %d.\n", __func__, addr, -// block_index, aligned_addr, wl_offset); - -// if ((cacheBlocks[block_index].addr == aligned_addr) && -// (cacheBlocks[block_index].valid)) { -// assert(cacheBlocks[block_index].allocated); -// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); -// // Hit -// // TODO: Add a hit latency as a param for this object. -// // Can't just schedule the nextResponseEvent for latency cycles in -// // the future. -// responseQueue.push_back(std::make_tuple(addr, -// cacheBlocks[block_index].items[wl_offset])); -// DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " -// "to responseQueue. responseQueue.size = %d, " -// "responseQueueSize = %d.\n", __func__, addr, -// cacheBlocks[block_index].items[wl_offset].to_string(), -// responseQueue.size(), -// peerWLEngine->getRegisterFileSize()); -// DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " -// "to responseQueue. responseQueue.size = %d, " -// "responseQueueSize = %d.\n", __func__, addr, -// cacheBlocks[block_index].items[wl_offset].to_string(), -// responseQueue.size(), -// peerWLEngine->getRegisterFileSize()); -// // TODO: Stat to count the number of WLItems that have been touched. 
-// cacheBlocks[block_index].busyMask |= (1 << wl_offset); -// stats.readHits++; - -// if (!nextResponseEvent.scheduled()) { -// schedule(nextResponseEvent, nextCycle()); -// } -// stats.numVertexReads++; -// return true; -// } else { -// // miss -// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); -// if (MSHR.find(block_index) == MSHR.end()) { -// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" -// " %lu not found in MSHRs.\n", __func__, block_index, addr); -// assert(MSHR.size() <= numMSHREntries); -// if (MSHR.size() == numMSHREntries) { -// // Out of MSHR entries -// DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " -// "Rejecting request.\n", __func__); -// // TODO: Break out read rejections into more than one stat -// // based on the cause of the rejection -// stats.readRejections++; -// return false; -// } else { -// DPRINTF(CoalesceEngine, "%s: MSHR " -// "entries available.\n", __func__); -// if (cacheBlocks[block_index].allocated) { -// assert(MSHR[block_index].size() <= numTgtsPerMSHR); -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " -// "with Addr: %lu.\n", __func__, addr, -// cacheBlocks[block_index].addr); -// if (MSHR[block_index].size() == numTgtsPerMSHR) { -// DPRINTF(CoalesceEngine, "%s: Out of targets for " -// "cacheBlocks[%d]. Rejecting request.\n", -// __func__, block_index); -// stats.readRejections++; -// return false; -// } -// cacheBlocks[block_index].hasConflict = true; -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " -// "for cacheBlocks[%d].\n", __func__, addr, block_index); -// stats.readMisses++; -// stats.numVertexReads++; -// if ((cacheBlocks[block_index].busyMask == 0) && -// (cacheBlocks[block_index].valid)) { -// DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " -// "busy. It %s in the applyQueue.\n", -// __func__, block_index, -// applyQueue.find(block_index) ? 
"is" : "is not"); -// if (!applyQueue.find(block_index)) { -// applyQueue.push_back(block_index); -// DPRINTF(CoalesceEngine, "%s: Added %d to " -// "applyQueue. applyQueue.size = %u.\n", -// __func__, block_index, applyQueue.size()); -// } -// assert(!applyQueue.empty()); -// if ((!nextApplyEvent.scheduled())) { -// schedule(nextApplyEvent, nextCycle()); -// } -// } -// return true; -// } else { -// assert(!cacheBlocks[block_index].valid); -// assert(MSHR[block_index].size() == 0); -// // MSHR available and no conflict -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " -// "Allocating a cache line for it.\n" -// , __func__, addr); - -// cacheBlocks[block_index].addr = aligned_addr; -// cacheBlocks[block_index].busyMask = 0; -// cacheBlocks[block_index].allocated = true; -// cacheBlocks[block_index].valid = false; -// cacheBlocks[block_index].hasConflict = false; -// DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" -// " Addr: %lu.\n", __func__, block_index, addr); -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " -// "for cacheBlocks[%d].\n", __func__, addr, block_index); -// memoryFunctionQueue.emplace_back( -// [this] (int block_index) { -// processNextRead(block_index); -// }, block_index); -// DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " -// "input %d to memoryFunctionQueue.\n", -// __func__, block_index); -// if ((!nextMemoryEvent.pending()) && -// (!nextMemoryEvent.scheduled())) { -// schedule(nextMemoryEvent, nextCycle()); -// } -// stats.readMisses++; -// stats.numVertexReads++; -// return true; -// } -// } -// } else { -// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " -// "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); -// if (MSHR[block_index].size() == numTgtsPerMSHR) { -// DPRINTF(CoalesceEngine, "%s: Out of targets for " -// "cacheBlocks[%d]. 
Rejecting request.\n", -// __func__, block_index); -// stats.readRejections++; -// return false; -// } -// if ((aligned_addr != cacheBlocks[block_index].addr)) { -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " -// "with Addr: %lu.\n", __func__, addr, -// cacheBlocks[block_index].addr); -// cacheBlocks[block_index].hasConflict = true; -// } else { -// DPRINTF(CoalesceEngine, "%s: There is room for another target " -// "for cacheBlocks[%d].\n", __func__, block_index); -// } - -// if (aligned_addr != cacheBlocks[block_index].addr) { -// stats.readMisses++; -// } else { -// stats.readHitUnderMisses++; -// } - -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " -// "cacheBlocks[%d].\n", __func__, addr, block_index); -// stats.numVertexReads++; -// return true; -// } -// } -// } - bool CoalesceEngine::recvWLRead(Addr addr) { @@ -615,6 +444,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " "fill cacheBlocks[%d].\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); @@ -632,6 +463,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].pendingData = false; + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); delete pkt; } @@ -639,7 +472,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHR[block_index].size(); i++) { Addr miss_addr = MSHR[block_index][i]; - Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); if 
(aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " @@ -662,6 +496,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); // End of the said block servicedIndices.push_back(i); // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " @@ -677,15 +513,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHR[block_index].erase(MSHR[block_index].begin() + servicedIndices[i] - bias); bias++; - DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", - __func__, print_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced " + "and is removed.\n", __func__, print_addr); } if (MSHR[block_index].empty()) { MSHR.erase(block_index); - cacheBlocks[block_index].hasConflict = false; - } else { - assert(cacheBlocks[block_index].hasConflict); + // cacheBlocks[block_index].hasConflict = false; } if ((!nextResponseEvent.scheduled()) && @@ -726,37 +560,111 @@ CoalesceEngine::processNextResponseEvent() void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - // TODO: Parameterize all the numbers here. Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - - DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", - __func__, wl.to_string(), addr); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. 
This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, wl.to_string(), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, wl.to_string(), addr); + // Desing does not allow for write misses for now. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask != 0); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + // respective bit in busyMask for wl is set. assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { - cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].items[wl_offset] = wl; + cacheBlocks[block_index].needsApply |= true; + // NOTE: We don't set needsWB and rely on processNextApplyEvent to + // set that bit. stats.numVertexWrites++; } - cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cacheBlocks[%d]." - " It does not have any taken items anymore.\n", - __func__, block_index); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. 
applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); + if (cacheBlocks[block_index].needsApply) { + cacheBlocks[block_index].pendingApply = true; + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " + "applyQueue.\n", __func__, block_index); + } else { + assert(MSHR.size() <= numMSHREntries); + // cache line has conflict. + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflict.\n", __func__, block_index); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" + " back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" + " a write back.\n", __func__, block_index); + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + 
DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + } } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if ((!applyQueue.empty()) && (!nextApplyEvent.scheduled())) { From f138726a23ee6395f6c8f55a278677690cb57c83 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 31 Jul 2022 14:32:04 -0700 Subject: [PATCH 139/247] Completed cache block state machine. Needs rework of push interface. --- src/accl/graph/sega/coalesce_engine.cc | 205 +++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 7 +- 2 files changed, 109 insertions(+), 103 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8f33a2d893..904889f12b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -198,7 +198,11 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - assert(cacheBlocks[block_index].addr != aligned_addr); + // FIXME: Kake this assert work. It will break if the cache block + // is cold and addr or aligned_addr is 0. It fails because cache block + // addr field is initialized to 0. Unfortunately Addr type is unsigned. + // So you can not initialized addr to -1. 
+ // assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { @@ -220,14 +224,6 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. Rejecting request.\n", - __func__, block_index); - stats.readRejections++; - return false; - } if ((cacheBlocks[block_index].valid) && (cacheBlocks[block_index].busyMask == 0) && (!cacheBlocks[block_index].pendingApply) && @@ -288,7 +284,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].pendingWB = false; memoryFunctionQueue.emplace_back( [this] (int block_index) { processNextRead(block_index); @@ -323,7 +319,7 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[blokc_index].pendingData); + assert(!cacheBlocks[block_index].pendingData); assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); assert(MSHR[block_index].size() == 0); @@ -607,6 +603,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } } else { assert(MSHR.size() <= numMSHREntries); // cache line has conflict. 
@@ -666,70 +666,71 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); - } - } void CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + "cacheBlock[%d] to be applied.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingWB); - if (cacheBlocks[block_index].busyMask != 0) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " - "apply process. Therefore, ignoring the apply schedule.\n", - __func__, block_index); - stats.falseApplySchedules++; - } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. 
" - "Therefore, no apply needed.\n", __func__, block_index); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] could be applied.\n", - __func__, block_index); - for (int i = 0; i < numElementsPerLine; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - - if (new_prop != old_prop) { - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - int bit_index = - getBitIndexBase(cacheBlocks[block_index].addr) + i; - if ((cacheBlocks[block_index].items[i].degree != 0) && - (needsPush[bit_index] == 0)) { - // If the respective bit in the bit vector is set - // there is no need to try and resend it. + if (cacheBlocks[block_index].pendingApply) { + assert(cacheBlocks[block_index].busyMask == 0); + for (int index = 0; index < numElementsPerLine; index++) { + uint32_t current_prop = cacheBlocks[block_index].items[index].prop; + uint32_t new_prop = std::min(current_prop, + cacheBlocks[block_index].items[index].tempProp); + if (new_prop != current_prop) { + cacheBlocks[block_index].items[index].tempProp = new_prop; + cacheBlocks[block_index].items[index].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu][%d]: %s.\n", + __func__, cacheBlocks[block_index].addr, index, + cacheBlocks[block_index].items[index].to_string()); + + int bit_index_base = + getBitIndexBase(cacheBlocks[block_index].addr); + if ((needsPush[bit_index_base + index] == 0) && + (cacheBlocks[block_index].items[index].degree != 0)) { if (peerPushEngine->allocatePushSpace()) { peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); + cacheBlocks[block_index].items[index]); } else { - needsPush[bit_index] = 1; + 
needsPush[bit_index_base + index] = 1; } } } } - } + cacheBlocks[block_index].needsWB = true; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingApply = false; - // TODO: This is where eviction policy goes - if ((cacheBlocks[block_index].hasConflict) && - (cacheBlocks[block_index].busyMask == 0)) { - memoryFunctionQueue.emplace_back([this] (int block_index) { + assert(MSHR.size() < numMSHREntries); + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflicts.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBack(block_index); }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input %d " - "to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); } + DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); } applyQueue.pop_front(); @@ -770,6 +771,17 @@ CoalesceEngine::processNextMemoryEvent() void CoalesceEngine::processNextRead(int block_index) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + 
assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " @@ -781,54 +793,53 @@ CoalesceEngine::processNextRead(int block_index) void CoalesceEngine::processNextWriteBack(int block_index) { - // Why would we write it back if it does not have a conflict? - assert(cacheBlocks[block_index].hasConflict); - - if ((cacheBlocks[block_index].busyMask != 0) || - (applyQueue.find(block_index))) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " - "writeback process. Therefore, ignoring the apply schedule.\n", - __func__, block_index); - // FIXME: Fix the name of this stat. - stats.falseEvictSchedules++; - } else { - if (cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: Change observed on " - "cacheBlocks[%d].\n", __func__, block_index); - PacketPtr write_pkt = createWritePacket( + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + if (cacheBlocks[block_index].pendingWB) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to " + DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, - write_pkt->getAddr(), write_pkt->getSize()); - memPort.sendPacket(write_pkt); - } else { - DPRINTF(CoalesceEngine, "%s: No change observed on " - "cacheBlocks[%d]. No write back needed.\n", - __func__, block_index); - } - assert(!MSHR[block_index].empty()); + pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].pendingWB = false; + Addr miss_addr = MSHR[block_index].front(); - DPRINTF(CoalesceEngine, "%s: First conflicting address for " - "cacheBlocks[%d] is Addr: %lu.\n", - __func__, block_index, miss_addr); Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " - "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + 
cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextRead(block_index); - }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input %d to " - "memoryFunctionQueue.\n", __func__, block_index); + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); } } @@ -866,7 +877,7 @@ CoalesceEngine::getOptimalBitVectorSlice() // } return std::make_tuple(true, it); } else if (!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].allocated))) { + (cacheBlocks[block_index].pendingData))) { // score += numElementsPerLine; // if (current_score > score) { // score = current_score; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e7655a069e..2ba0b62aaf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -59,9 +59,6 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingData; bool pendingApply; bool pendingWB; - - bool allocated; - bool hasConflict; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -73,9 +70,7 @@ class CoalesceEngine : public BaseMemoryEngine needsWB(false), pendingData(false), pendingApply(false), - pendingWB(false), - allocated(false), - hasConflict(false) + pendingWB(false) { items = new WorkListItem [num_elements]; } From 4138a240b59a7d1da2370ff87d2848787a85ec09 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 2 Aug 2022 22:33:54 -0700 Subject: [PATCH 140/247] Fixing scheduling error of memory functions. 
--- src/accl/graph/SConscript | 32 ----- src/accl/graph/base/data_structs.hh | 2 +- src/accl/graph/sega/SConscript | 9 +- src/accl/graph/sega/coalesce_engine.cc | 176 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 24 ++-- 5 files changed, 120 insertions(+), 123 deletions(-) delete mode 100644 src/accl/graph/SConscript diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript deleted file mode 100644 index 5dffd1a396..0000000000 --- a/src/accl/graph/SConscript +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2016 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Import('*') - -DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', - 'WLEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 707b57c56f..830f1ecc16 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -47,7 +47,7 @@ struct __attribute__ ((packed)) WorkListItem std::string to_string() { return csprintf( - "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", tempProp, prop, degree, edgeIndex); } diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 81a29df6af..4c398b5ccd 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -39,10 +39,15 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') +DebugFlag('BaseMemoryEngine') +DebugFlag('BitVector') DebugFlag('CenteralController') -DebugFlag('CoalesceEngine') DebugFlag('CacheBlockState') +DebugFlag('CoalesceEngine') DebugFlag('PushEngine') +DebugFlag('SEGAStructureSize') DebugFlag('WLEngine') + +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', + 'WLEngine', 'BaseMemoryEngine']) \ No newline at end of file diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 904889f12b..da2bc54c19 
100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -33,8 +33,9 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/CoalesceEngine.hh" +#include "debug/BitVector.hh" #include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" @@ -76,6 +77,13 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) peerWLEngine = wl_engine; } +DrainState +CoalesceEngine::drain() +{ + DPRINTF(CoalesceEngine, "%s: drain called.\n"); + return DrainState::Drained; +} + // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) @@ -156,6 +164,7 @@ CoalesceEngine::recvWLRead(Addr addr) // and skip the process if the respective bit is set to false. cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -198,7 +207,7 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - // FIXME: Kake this assert work. It will break if the cache block + // FIXME: Make this assert work. It will break if the cache block // is cold and addr or aligned_addr is 0. It fails because cache block // addr field is initialized to 0. Unfortunately Addr type is unsigned. // So you can not initialized addr to -1. 
@@ -258,10 +267,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" "to be written back.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed " "processNextWriteBack for input " "%d to memoryFunctionQueue.\n", @@ -274,7 +284,7 @@ CoalesceEngine::recvWLRead(Addr addr) "%s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does" + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " "not need to be written back.\n", __func__, block_index); cacheBlocks[block_index].addr = aligned_addr; @@ -285,10 +295,11 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed " "processNextRead for input " "%d to memoryFunctionQueue.\n", @@ -332,17 +343,16 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - // cacheBlocks[block_index].allocated = true; - // cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, 
block_index, addr); MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " "input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -415,7 +425,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem* items = pkt->getPtr(); int push_needed = 0; // No applying of the line needed. - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { @@ -427,7 +437,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); @@ -459,6 +469,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].pendingData = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); delete pkt; @@ -492,6 +503,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); // End of the said block @@ -590,6 +602,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -600,6 +613,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if ((cacheBlocks[block_index].busyMask == 0)) { if (cacheBlocks[block_index].needsApply) { cacheBlocks[block_index].pendingApply = true; + cacheBlocks[block_index].lastChangedTick = curTick(); applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); @@ -617,10 +631,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" " back.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " "for input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -645,10 +660,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - 
processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " "for input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -710,15 +726,18 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].needsWB = true; cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); assert(MSHR.size() < numMSHREntries); if (MSHR.find(block_index) != MSHR.end()) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " "conflicts.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" " %d to memoryFunctionQueue.\n", __func__, block_index); if ((!nextMemoryEvent.pending()) && @@ -750,12 +769,14 @@ CoalesceEngine::processNextMemoryEvent() DPRINTF(CoalesceEngine, "%s: Processing another " "memory function.\n", __func__); - std::function next_memory_function; + std::function next_memory_function; int next_memory_function_input; + Tick next_memory_function_tick; std::tie( next_memory_function, - next_memory_function_input) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input); + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); memoryFunctionQueue.pop_front(); DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" "memoryFunctionQueue.size = %d.\n", __func__, @@ -769,12 +790,16 @@ CoalesceEngine::processNextMemoryEvent() } void -CoalesceEngine::processNextRead(int block_index) +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // + assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); @@ -791,23 +816,25 @@ CoalesceEngine::processNextRead(int block_index) } void -CoalesceEngine::processNextWriteBack(int block_index) +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - - // Why would we write it back if it does not have a conflict. - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - if (cacheBlocks[block_index].pendingWB) { + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].needsWB); assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(cacheBlocks[block_index].pendingWB); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -833,13 +860,21 @@ CoalesceEngine::processNextWriteBack(int block_index) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextRead(block_index); - }, block_index); + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); } } @@ -863,9 +898,14 @@ CoalesceEngine::getOptimalBitVectorSlice() // current_score += current_popcount; Addr addr = getBlockAddrFromBitIndex(it); int block_index = getBlockIndex(addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].busyMask == 0)) { + // Idle state: valid && !pendingApply && !pendingWB + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); // current_score += numElementsPerLine * 2; // if (current_score > score) { // score = current_score; @@ -876,8 +916,7 @@ CoalesceEngine::getOptimalBitVectorSlice() // } // } return std::make_tuple(true, it); - } else if (!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].pendingData))) { + } else if (cacheBlocks[block_index].addr != addr) { // score += numElementsPerLine; // if (current_score > score) { // score = current_score; @@ -893,7 +932,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextPushRetry(int slice_base_2) +CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; @@ -907,17 +946,11 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) assert(cacheBlocks[block_index].busyMask == 0); int push_needed = 0; - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { - // TODO: Make this more programmable - uint32_t new_prop = 
std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; if (needsPush[slice_base + i] == 1) { peerPushEngine->recvWLItemRetry( cacheBlocks[block_index].items[i]); @@ -925,24 +958,11 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) push_needed += needsPush[slice_base + i]; needsPush[slice_base + i] = 0; } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); assert(peerPushEngine->getNumRetries() == needsPush.count()); - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); - } - if (cacheBlocks[block_index].hasConflict) { - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed nextWriteBackEvent for" - " input %d to memoryFunctionQueue.\n", - __func__, block_index); - } - } } else { PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -958,9 +978,10 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) } if (numRetriesReceived > 0) { - memoryFunctionQueue.emplace_back([this] (int slice_base) { - processNextPushRetry(slice_base); - }, 0); + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextPushRetry(slice_base, schedule_tick); + }, 0, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " "0 to memoryFunctionQueue.\n", __func__); } @@ -990,9 +1011,10 @@ CoalesceEngine::recvPushRetry() assert(numRetriesReceived == 1); // TODO: Pass slice_base to 
getOptimalBitVectorSlice - memoryFunctionQueue.emplace_back([this] (int slice_base) { - processNextPushRetry(slice_base); - }, 0); + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextPushRetry(slice_base, schedule_tick); + }, 0, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " "memoryFunctionQueue.\n", __func__); if ((!nextMemoryEvent.pending()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2ba0b62aaf..ce6e0daca6 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -59,6 +59,7 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingData; bool pendingApply; bool pendingWB; + Tick lastChangedTick; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -70,7 +71,8 @@ class CoalesceEngine : public BaseMemoryEngine needsWB(false), pendingData(false), pendingApply(false), - pendingWB(false) + pendingWB(false), + lastChangedTick(0) { items = new WorkListItem [num_elements]; } @@ -78,10 +80,11 @@ class CoalesceEngine : public BaseMemoryEngine std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s}", addr, busyMask, - valid ? "true" : "false", needsApply ? "true" : "false", - needsWB ? "true" : "false", pendingData ? "true" : "false", - pendingApply ? "true" : "false", pendingWB ? "true" : "false"); + "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + needsApply ? "true" : "false", needsWB ? "true" : "false", + pendingData ? "true" : "false", pendingApply ? "true" : "false", + pendingWB ? 
"true" : "false", lastChangedTick); } }; @@ -114,10 +117,10 @@ class CoalesceEngine : public BaseMemoryEngine MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); - void processNextRead(int block_index); - void processNextWriteBack(int block_index); - void processNextPushRetry(int slice_base); - std::deque, int>> memoryFunctionQueue; + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextPushRetry(int slice_base, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); @@ -151,12 +154,11 @@ class CoalesceEngine : public BaseMemoryEngine public: PARAMS(CoalesceEngine); - CoalesceEngine(const Params ¶ms); + virtual DrainState drain() override; bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - void registerWLEngine(WLEngine* wl_engine); void recvPushRetry(); From 1194dc3ec83a9b78acfa4487cbd2552eed74c317 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 3 Aug 2022 12:41:28 -0700 Subject: [PATCH 141/247] Fixing incorrect assert. 
--- src/accl/graph/sega/coalesce_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index da2bc54c19..21dd746aad 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -728,7 +728,7 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].lastChangedTick = curTick(); - assert(MSHR.size() < numMSHREntries); + assert(MSHR.size() <= numMSHREntries); if (MSHR.find(block_index) != MSHR.end()) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " "conflicts.\n", __func__, block_index); From c1d92aed296ca6827fb75047216c32efbe477b98 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 5 Aug 2022 13:37:54 -0700 Subject: [PATCH 142/247] Updating memory address mapping and interface for push coalesce. --- configs/accl/sega.py | 30 ++++++++++------- src/accl/graph/base/base_reduce_engine.cc | 2 +- src/accl/graph/base/base_reduce_engine.hh | 3 +- src/accl/graph/base/data_structs.hh | 19 +++++++++++ src/accl/graph/sega/PushEngine.py | 3 +- src/accl/graph/sega/push_engine.cc | 40 ++++++++++++++++------- src/accl/graph/sega/push_engine.hh | 35 +++++++++++++++----- 7 files changed, 96 insertions(+), 36 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7577331f2b..26488ef69d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -8,20 +8,23 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=2, + self.push_engine = PushEngine(base_edge_addr=0, + push_req_queue_size=32, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + resp_queue_size=64) + # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, + # push_req_queue_size=32, + # attached_memory_atom_size=64, + # 
resp_queue_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="32B", - num_mshr_entry=1, - num_tgts_per_mshr=1) + cache_size="8MiB", + num_mshr_entry=32, + num_tgts_per_mshr=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=64, + register_file_size=32) def getRespPort(self): return self.wl_engine.resp_port @@ -74,10 +77,15 @@ def __init__(self, latency="30ns") ) edge_mem_ctrl.append( - SimpleMemory(range=self._edge_ranges[i], + # SimpleMemory(range=self._edge_ranges[i], + # bandwidth="4.8GB/s", + # latency="30ns", + # image_file=f"{graph_path}/edgelist_{i}") + SimpleMemory(range=AddrRange(self._edge_chunk_size), bandwidth="4.8GB/s", latency="30ns", - image_file=f"{graph_path}/edgelist_{i}") + image_file=f"{graph_path}/edgelist_{i}", + in_addr_map=False) ) self.vertex_mem_ctrl = vertex_mem_ctrl self.edge_mem_ctrl = edge_mem_ctrl diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index 38a8662ed0..ade95800d2 100644 --- a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -31,7 +31,7 @@ namespace gem5 { -BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): +BaseReduceEngine::BaseReduceEngine(const Params ¶ms): ClockedObject(params), system(params.system), _requestorId(system->getRequestorId(this)) diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index c8c9784ed1..268bb60b76 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -47,8 +47,7 @@ class BaseReduceEngine : public ClockedObject public: PARAMS(BaseReduceEngine); - - BaseReduceEngine(const BaseReduceEngineParams ¶ms); + BaseReduceEngine(const Params ¶ms); ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } diff --git 
a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 830f1ecc16..6f775d8a38 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -78,15 +78,34 @@ struct __attribute__ ((packed)) Edge return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); } + Edge(): weight(0), neighbor(0) {} + Edge(uint16_t weight, uint64_t neighbor): weight(weight), neighbor(neighbor) {} + }; static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); +struct CompleteEdge { + uint64_t src; + uint64_t dst; + uint32_t weight; + + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight): + src(src), dst(dst), weight(weight) + {} + + std::string to_string() + { + return csprintf("CompleteEdge{src: %lu, dst:%lu, weight: %u}", + src, dst, weight); + } +}; + template class UniqueFIFO { diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 447731219e..a45f5d6ead 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -35,8 +35,7 @@ class PushEngine(BaseMemoryEngine): cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr("The base address for the " - "attached edge memory") + push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d87462d7dd..d071e8fd37 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -38,7 +38,6 @@ namespace gem5 PushEngine::PushEngine(const Params ¶ms): BaseMemoryEngine(params), reqPort(name() + ".req_port", this), - baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), numTotalRetries(0), numPendingRetries(0), onTheFlyMemReqs(0), @@ -140,12 +139,12 @@ PushEngine::recvWLItem(WorkListItem wl) "checking if there is 
enough push space. Use allocatePushSpace.\n"); DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + peerMemoryAtomSize, value, 0); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -162,12 +161,12 @@ PushEngine::recvWLItemRetry(WorkListItem wl) DPRINTF(PushEngine, "%s: Received %s with retry.\n", __func__, wl.to_string()); - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + peerMemoryAtomSize, value, 0); assert(pushReqQueue.size() <= pushReqQueueSize); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -191,22 +190,24 @@ PushEngine::processNextMemoryReadEvent() Addr aligned_addr, offset; int num_edges; - PushPacketInfoGen &curr_info = pushReqQueue.front(); + EdgeReadInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); DPRINTF(PushEngine, "%s: Current packet information generated by " - "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "EdgeReadInfoGen. 
aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; reqValueMap[pkt->req] = curr_info.value(); + PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; + reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); onTheFlyMemReqs++; if (curr_info.done()) { - DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); + DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); pushReqQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", @@ -228,9 +229,6 @@ PushEngine::processNextMemoryReadEvent() } } - // if ((!nextMemoryReadEvent.scheduled()) && (!pushReqQueue.empty())) { - // schedule(nextMemoryReadEvent, nextCycle()); - // } if (!pushReqQueue.empty()) { assert(!nextMemoryReadEvent.pending()); assert(!nextMemoryReadEvent.scheduled()); @@ -265,6 +263,20 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs--; assert(memRespQueue.size() <= memRespQueueSize); + uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + PushInfo push_info = reqInfoMap[pkt->req]; + pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); + + std::vector edges; + for (int i = 0; i < push_info.numElements; i++) { + Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); + Addr edge_dst = edge->neighbor; + uint32_t edge_weight = edge->weight; + edges.emplace_back(push_info.src, edge_dst, edge_weight); + } + edgeQueue.push_back(edges); + delete pkt_data; + if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); } @@ -288,6 +300,12 @@ PushEngine::processNextPushEvent() Edge* curr_edge = (Edge*) (data + offset); + std::vector& current_edges = edgeQueue.front(); + while(!current_edges.empty()) { + CompleteEdge curr_edge = current_edges.back(); + 
DPRINTF(PushEngine, "%s: %s.\n", __func__, curr_edge.to_string()); + current_edges.pop_back(); + } // TODO: Implement propagate function here uint32_t update_value = value + 1; PacketPtr update = createUpdatePacket( diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9b182e2251..7fb6c42579 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -42,19 +42,21 @@ class CoalesceEngine; class PushEngine : public BaseMemoryEngine { private: - class PushPacketInfoGen { + class EdgeReadInfoGen { private: Addr _start; Addr _end; size_t _step; size_t _atom; + uint32_t _value; + Addr _src; public: - PushPacketInfoGen(Addr start, Addr end, size_t step, - size_t atom, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _value(value) + EdgeReadInfoGen(Addr start, Addr end, size_t step, + size_t atom, uint32_t value, Addr src): + _start(start), _end(end), _step(step), + _atom(atom), _value(value), _src(src) {} std::tuple nextReadPacketInfo() @@ -74,8 +76,17 @@ class PushEngine : public BaseMemoryEngine return std::make_tuple(aligned_addr, offset, num_items); } - uint32_t value() { return _value; } bool done() { return (_start >= _end); } + + Addr src() { return _src; } + uint32_t value() { return _value; } + }; + + struct PushInfo { + Addr src; + uint32_t value; + Addr offset; + int numElements; }; class ReqPort : public RequestPort @@ -98,26 +109,27 @@ class PushEngine : public BaseMemoryEngine virtual void recvReqRetry(); }; + bool _running; int numElementsPerLine; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; - Addr baseEdgeAddr; - int pushReqQueueSize; int numTotalRetries; int numPendingRetries; - std::deque pushReqQueue; + std::deque pushReqQueue; // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; + std::unordered_map reqInfoMap; int onTheFlyMemReqs; int memRespQueueSize; 
std::deque memRespQueue; + std::deque> edgeQueue; template PacketPtr createUpdatePacket(Addr addr, T value); @@ -167,6 +179,11 @@ class PushEngine : public BaseMemoryEngine int getNumRetries() { return numTotalRetries; } + void start(); // CoalesceEngine announcing work + void stop(); // CoalesceEngine announcing no work + bool running() { return _running; } + void recvWLItem2(Addr addr, WorkListItem wl); + }; } From 371f2b600c6b24ad2bdcb3f434284c06b22cff04 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 12 Aug 2022 08:32:42 -0700 Subject: [PATCH 143/247] Implemented pullVertex. --- configs/accl/sega.py | 7 +- src/accl/graph/base/data_structs.hh | 5 +- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/base_memory_engine.cc | 8 +- src/accl/graph/sega/coalesce_engine.cc | 71 +++--- src/accl/graph/sega/coalesce_engine.hh | 6 +- src/accl/graph/sega/push_engine.cc | 257 +++++++++------------- src/accl/graph/sega/push_engine.hh | 52 ++--- 8 files changed, 167 insertions(+), 240 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 26488ef69d..e7a704d477 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -6,10 +6,9 @@ from m5.util.convert import toMemorySize class MPU(SubSystem): - def __init__(self, base_edge_addr): + def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0, - push_req_queue_size=32, + self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, @@ -151,7 +150,7 @@ def __init__(self, mpus = [] for i in range(num_mpus): - mpus.append(MPU(base_edge_addr=self.mem_ctrl.getEdgeBaseAddr(i))) + mpus.append(MPU()) mpus[i].setReqPort(self.interconnect.cpu_side_ports) mpus[i].setRespPort(self.interconnect.mem_side_ports) mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 
6f775d8a38..026a3cb7b2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -94,9 +94,10 @@ struct CompleteEdge { uint64_t src; uint64_t dst; uint32_t weight; + uint32_t value; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight): - src(src), dst(dst), weight(weight) + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) {} std::string to_string() diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 4c398b5ccd..ae216ccdd4 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -47,6 +47,7 @@ DebugFlag('CacheBlockState') DebugFlag('CoalesceEngine') DebugFlag('PushEngine') DebugFlag('SEGAStructureSize') +DebugFlag('TempFlag') DebugFlag('WLEngine') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index a5d1d7e8e7..9bd1941b23 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -99,11 +99,9 @@ BaseMemoryEngine::MemPort::recvReqRetry() "Received retry without a blockedPacket"); _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); } PacketPtr diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 21dd746aad..dcec2a5f78 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -47,8 +47,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - numRetriesReceived(0), + 
numMSHREntries(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -423,26 +424,20 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "for addr %lu. It was not found in the cache.\n", __func__, addr); WorkListItem* items = pkt->getPtr(); - int push_needed = 0; // No applying of the line needed. DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (items[i].degree == 0))); + Addr vertex_addr = addr + i * sizeof(WorkListItem); if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry(items[i]); + _workCount--; + needsPush[it + i] = 0; + peerPushEngine->recvVertexPush(vertex_addr, items[i]); + break; } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - // } delete pkt; return true; } @@ -691,7 +686,7 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. 
" "cacheBlock[%d] to be applied.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, cacheBlocks[block_index].to_string()); + __func__, block_index, cacheBlocks[block_index].to_string()); assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); @@ -712,14 +707,15 @@ CoalesceEngine::processNextApplyEvent() int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if ((needsPush[bit_index_base + index] == 0) && - (cacheBlocks[block_index].items[index].degree != 0)) { - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[index]); - } else { + + if (cacheBlocks[block_index].items[index].degree > 0) { + if (needsPush[bit_index_base + index] == 0) { + _workCount++; needsPush[bit_index_base + index] = 1; } + if (!peerPushEngine->running()) { + peerPushEngine->start(); + } } } } @@ -945,24 +941,20 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - int push_needed = 0; DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { + Addr vertex_addr = addr + i * sizeof(WorkListItem); if (needsPush[slice_base + i] == 1) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i]); + _workCount--; + needsPush[slice_base + i] = 0; + peerPushEngine->recvVertexPush(vertex_addr, + cacheBlocks[block_index].items[i]); + break; } - push_needed += needsPush[slice_base + i]; - needsPush[slice_base + i] = 0; } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); } else { PacketPtr pkt 
= createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -973,11 +965,10 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) // a flag to true (maybe not even needed just look if the cache has a // line allocated for it in the cacheBlocks). } - numRetriesReceived--; - assert(numRetriesReceived == 0); + numPullsReceived--; } - if (numRetriesReceived > 0) { + if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { processNextPushRetry(slice_base, schedule_tick); @@ -1002,29 +993,19 @@ CoalesceEngine::recvMemRetry() } void -CoalesceEngine::recvPushRetry() +CoalesceEngine::recvVertexPull() { - numRetriesReceived++; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - // For now since we do only one retry at a time, we should not receive - // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(numRetriesReceived == 1); - - // TODO: Pass slice_base to getOptimalBitVectorSlice + numPullsReceived++; memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { processNextPushRetry(slice_base, schedule_tick); }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " - "memoryFunctionQueue.\n", __func__); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } } - - CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ce6e0daca6..6969fe2823 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,7 +106,8 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map> MSHR; std::deque> responseQueue; - int numRetriesReceived; + int _workCount; + int numPullsReceived; UniqueFIFO 
applyQueue; std::bitset needsPush; @@ -161,7 +162,8 @@ class CoalesceEngine : public BaseMemoryEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); - void recvPushRetry(); + int workCount() { return _workCount; } + void recvVertexPull(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d071e8fd37..b5341b3d61 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/PushEngine.hh" +#include "debug/TempFlag.hh" #include "mem/packet_access.hh" namespace gem5 @@ -38,13 +39,12 @@ namespace gem5 PushEngine::PushEngine(const Params ¶ms): BaseMemoryEngine(params), reqPort(name() + ".req_port", this), - pushReqQueueSize(params.push_req_queue_size), - numTotalRetries(0), numPendingRetries(0), - onTheFlyMemReqs(0), - memRespQueueSize(params.resp_queue_size), + _running(false), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), - nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) {} @@ -66,15 +66,31 @@ PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, numElementsPerLine = elements_per_line; } +void +PushEngine::recvReqRetry() +{ + DPRINTF(PushEngine, "%s: Received a req retry.\n", __func__); + if (nextPushEvent.pending()) { + nextPushEvent.wake(); + schedule(nextPushEvent, nextCycle()); + } +} + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
+ DPRINTF(PushEngine, "%s: Sending pakcet: %s to " + "the network.\n", __func__, pkt->print()); if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; + DPRINTF(PushEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(PushEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvReqRetry(); } } @@ -92,86 +108,73 @@ PushEngine::ReqPort::recvReqRetry() DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); _blocked = false; - sendPacket(blockedPacket); - - if (!_blocked) { - blockedPacket = nullptr; - DPRINTF(PushEngine, "%s: Sent the blockedPacket. " - "_blocked: %s, (blockedPacket == nullptr): %s.\n", - __func__, _blocked ? "true" : "false", - (blockedPacket == nullptr) ? "true" : "false"); - } + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +bool +PushEngine::vertexSpace() +{ + return (edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize); +} + +bool +PushEngine::workLeft() +{ + return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); } void -PushEngine::deallocatePushSpace(int space) +PushEngine::start() { - /// DISCUSS: Might have to check whether the addrGenEvent is scheduled - // and or the pushReqQueue is empty. If so we might need to - // send retries. 
- DPRINTF(PushEngine, "%s: Received reported %d free spaces.\n", - __func__, space); - numPendingRetries--; - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " - "free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - assert(!nextSendRetryEvent.scheduled()); - schedule(nextSendRetryEvent, nextCycle()); - } + assert(!_running); + assert(!nextVertexPullEvent.scheduled()); + + _running = true; + // NOTE: We might have to check for size availability here. + assert(workLeft()); + if (vertexSpace()) { + schedule(nextVertexPullEvent, nextCycle()); } } void -PushEngine::recvWLItem(WorkListItem wl) +PushEngine::processNextVertexPullEvent() { - assert(wl.degree != 0); - - assert((pushReqQueueSize == 0) || - (pushReqQueue.size() < pushReqQueueSize)); - panic_if((pushReqQueue.size() == pushReqQueueSize) && - (pushReqQueueSize != 0), "You should call this method after " - "checking if there is enough push space. 
Use allocatePushSpace.\n"); + // TODO: change edgePointerQueueSize + numPendingPulls++; + peerCoalesceEngine->recvVertexPull(); - DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); - Addr start_addr = wl.edgeIndex * sizeof(Edge); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value, 0); - DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", - __func__, pushReqQueue.size()); + if (!workLeft()) { + _running = false; + } - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())) { - schedule(nextMemoryReadEvent, nextCycle()); + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); } } void -PushEngine::recvWLItemRetry(WorkListItem wl) +PushEngine::recvVertexPush(Addr addr, WorkListItem wl) { - assert(wl.degree != 0); - DPRINTF(PushEngine, "%s: Received %s with retry.\n", - __func__, wl.to_string()); + assert(wl.degree > 0); + assert((edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value, 0); - assert(pushReqQueue.size() <= pushReqQueueSize); - DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", - __func__, pushReqQueue.size()); + edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, (uint32_t) wl.prop); + numPendingPulls--; + DPRINTF(TempFlag, "%s: Received {addr: %lu, wl: %s}.\n", + __func__, addr, wl.to_string()); + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } - numTotalRetries--; if ((!nextMemoryReadEvent.pending()) && 
(!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); @@ -186,20 +189,17 @@ PushEngine::processNextMemoryReadEvent() return; } - if (memRespQueue.size() < (memRespQueueSize - onTheFlyMemReqs)) { + if (edgeQueue.size() < (edgeQueueSize - onTheFlyMemReqs)) { Addr aligned_addr, offset; int num_edges; - EdgeReadInfoGen &curr_info = pushReqQueue.front(); + EdgeReadInfoGen &curr_info = edgePointerQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - reqOffsetMap[pkt->req] = offset; - reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = curr_info.value(); PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; @@ -208,42 +208,23 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); - pushReqQueue.pop_front(); - DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " - "pushReqQueue.size() = %u.\n", - __func__, pushReqQueue.size()); - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - " free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } - } - } + edgePointerQueue.pop_front(); + DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" + "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); } } - if (!pushReqQueue.empty()) { + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if (!edgePointerQueue.empty()) { assert(!nextMemoryReadEvent.pending()); assert(!nextMemoryReadEvent.scheduled()); schedule(nextMemoryReadEvent, nextCycle()); } } -void -PushEngine::processNextSendRetryEvent() -{ - assert(numPendingRetries == 0); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); -} - void PushEngine::recvMemRetry() { @@ -259,25 +240,27 @@ PushEngine::handleMemResp(PacketPtr pkt) { // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - memRespQueue.push_back(pkt); - onTheFlyMemReqs--; - assert(memRespQueue.size() <= memRespQueueSize); uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::vector edges; + std::deque edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back(push_info.src, edge_dst, edge_weight); + edges.emplace_back(push_info.src, edge_dst, + edge_weight, push_info.value); } edgeQueue.push_back(edges); + onTheFlyMemReqs--; + reqInfoMap.erase(pkt->req); delete pkt_data; + delete pkt; - if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { + if ((!nextPushEvent.pending()) && + (!nextPushEvent.scheduled())) { schedule(nextPushEvent, nextCycle()); } return true; @@ -287,50 +270,37 @@ PushEngine::handleMemResp(PacketPtr pkt) void PushEngine::processNextPushEvent() { - PacketPtr pkt = memRespQueue.front(); - uint8_t* data = pkt->getPtr(); - - Addr offset = reqOffsetMap[pkt->req]; - assert(offset < peerMemoryAtomSize); - uint32_t value = reqValueMap[pkt->req]; + if 
(reqPort.blocked()) { + nextPushEvent.sleep(); + return; + } - DPRINTF(PushEngine, "%s: Looking at the front of the queue. pkt->Addr: %lu, " - "offset: %lu\n", - __func__, pkt->getAddr(), offset); + std::deque& edge_list = edgeQueue.front(); + CompleteEdge curr_edge = edge_list.front(); - Edge* curr_edge = (Edge*) (data + offset); + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, curr_edge.to_string()); - std::vector& current_edges = edgeQueue.front(); - while(!current_edges.empty()) { - CompleteEdge curr_edge = current_edges.back(); - DPRINTF(PushEngine, "%s: %s.\n", __func__, curr_edge.to_string()); - current_edges.pop_back(); - } // TODO: Implement propagate function here - uint32_t update_value = value + 1; + uint32_t update_value = curr_edge.value + 1; PacketPtr update = createUpdatePacket( - curr_edge->neighbor, update_value); - - if (!reqPort.blocked()) { - reqPort.sendPacket(update); - stats.numUpdates++; - DPRINTF(PushEngine, "%s: Sent a push update to addr: %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); - reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); - assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); - reqNumEdgeMap[pkt->req]--; - assert(reqNumEdgeMap[pkt->req] >= 0); - } + curr_edge.dst, update_value); + + reqPort.sendPacket(update); + stats.numUpdates++; + DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " + "with value: %d.\n", __func__, curr_edge.src, + curr_edge.dst, update_value); + - if (reqNumEdgeMap[pkt->req] == 0) { - reqOffsetMap.erase(pkt->req); - reqNumEdgeMap.erase(pkt->req); - reqValueMap.erase(pkt->req); - memRespQueue.pop_front(); - delete pkt; + edge_list.pop_front(); + if (edge_list.empty()) { + edgeQueue.pop_front(); } - if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + assert(!nextPushEvent.pending()); + assert(!nextPushEvent.scheduled()); + if (!edgeQueue.empty()) { schedule(nextPushEvent, nextCycle()); } } @@ -354,17 +324,6 @@ 
PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } -bool -PushEngine::allocatePushSpace() { - if ((pushReqQueueSize == 0) || - ((pushReqQueue.size() < pushReqQueueSize) && (numTotalRetries == 0))) { - return true; - } else { - numTotalRetries++; - return false; - } -} - PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7fb6c42579..c79b0de944 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -49,14 +49,14 @@ class PushEngine : public BaseMemoryEngine size_t _step; size_t _atom; - uint32_t _value; Addr _src; + uint32_t _value; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, uint32_t value, Addr src): + size_t atom, Addr src, uint32_t value): _start(start), _end(end), _step(step), - _atom(atom), _value(value), _src(src) + _atom(atom), _src(src), _value(value) {} std::tuple nextReadPacketInfo() @@ -109,38 +109,34 @@ class PushEngine : public BaseMemoryEngine virtual void recvReqRetry(); }; + ReqPort reqPort; + bool _running; int numElementsPerLine; CoalesceEngine* peerCoalesceEngine; - ReqPort reqPort; - - int pushReqQueueSize; - int numTotalRetries; - int numPendingRetries; - std::deque pushReqQueue; - - // TODO: Add size one size for all these maps - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; + int numPendingPulls; + int edgePointerQueueSize; + std::deque edgePointerQueue; std::unordered_map reqInfoMap; int onTheFlyMemReqs; - int memRespQueueSize; - std::deque memRespQueue; - std::deque> edgeQueue; + int edgeQueueSize; + std::deque> edgeQueue; template PacketPtr createUpdatePacket(Addr addr, T value); + EventFunctionWrapper nextVertexPullEvent; + void processNextVertexPullEvent(); + MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); - EventFunctionWrapper nextPushEvent; + 
MemoryEvent nextPushEvent; void processNextPushEvent(); - EventFunctionWrapper nextSendRetryEvent; - void processNextSendRetryEvent(); + bool vertexSpace(); + bool workLeft(); struct PushStats : public statistics::Group { @@ -166,24 +162,14 @@ class PushEngine : public BaseMemoryEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool allocatePushSpace(); - - void deallocatePushSpace(int space); - - void recvWLItem(WorkListItem wl); - - void recvWLItemRetry(WorkListItem wl); - void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); - int getNumRetries() { return numTotalRetries; } + void recvReqRetry(); - void start(); // CoalesceEngine announcing work - void stop(); // CoalesceEngine announcing no work + void start(); bool running() { return _running; } - void recvWLItem2(Addr addr, WorkListItem wl); - + void recvVertexPush(Addr addr, WorkListItem wl); }; } From 34d8bcef6633e9019c3fd4d3921044eb5bebedeb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 22 Aug 2022 11:51:06 -0700 Subject: [PATCH 144/247] Added sim exit functionality. 
WIP --- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 7 +++++++ src/accl/graph/sega/coalesce_engine.hh | 2 ++ src/accl/graph/sega/push_engine.cc | 11 +++++++++++ src/accl/graph/sega/push_engine.hh | 8 +++++--- src/accl/graph/sega/wl_engine.cc | 6 ++++++ src/accl/graph/sega/wl_engine.hh | 3 ++- 7 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 102800de92..1f325703bd 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -52,7 +52,7 @@ class CenteralController : public ClockedObject RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} - // virtual AddrRangeList getAddrRanges() const; + void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dcec2a5f78..57bc99013c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -85,6 +85,13 @@ CoalesceEngine::drain() return DrainState::Drained; } +bool +CoalesceEngine::done() +{ + return needsPush.none() && + memoryFunctionQueue.empty() && peerWLEngine->done(); +} + // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6969fe2823..b19a1bc461 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -164,6 +164,8 @@ class CoalesceEngine : public BaseMemoryEngine int workCount() { return _workCount; } void recvVertexPull(); + + bool done(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index b5341b3d61..9866c30f5c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -32,6 +32,7 @@ 
#include "debug/PushEngine.hh" #include "debug/TempFlag.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -126,6 +127,12 @@ PushEngine::workLeft() return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); } +bool +PushEngine::done() +{ + return edgeQueue.empty() && + edgePointerQueue.empty() && peerCoalesceEngine->done(); +} void PushEngine::start() { @@ -298,6 +305,10 @@ PushEngine::processNextPushEvent() edgeQueue.pop_front(); } + if (done()) { + exitSimLoopNow(name() + " is done."); + } + assert(!nextPushEvent.pending()); assert(!nextPushEvent.scheduled()); if (!edgeQueue.empty()) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c79b0de944..a42228f4c0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -126,6 +126,9 @@ class PushEngine : public BaseMemoryEngine template PacketPtr createUpdatePacket(Addr addr, T value); + bool vertexSpace(); + bool workLeft(); + EventFunctionWrapper nextVertexPullEvent; void processNextVertexPullEvent(); @@ -135,9 +138,6 @@ class PushEngine : public BaseMemoryEngine MemoryEvent nextPushEvent; void processNextPushEvent(); - bool vertexSpace(); - bool workLeft(); - struct PushStats : public statistics::Group { PushStats(PushEngine &push); @@ -170,6 +170,8 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); + + bool done(); }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 12f4548aa2..e999667ad1 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -121,6 +121,12 @@ WLEngine::getAddrRanges() const return coalesceEngine->getAddrRanges(); } +bool +WLEngine::done() +{ + return registerFile.empty() && updateQueue.empty(); +} + // TODO: Parameterize the number of pops WLEngine can do at a time. 
// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 5e8e5b25f3..1360d37132 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -80,7 +80,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; void recvFunctional(PacketPtr pkt); - AddrRangeList getAddrRanges() const; EventFunctionWrapper nextReadEvent; @@ -116,6 +115,8 @@ class WLEngine : public BaseReduceEngine void handleIncomingWL(Addr addr, WorkListItem wl); int getRegisterFileSize() { return registerFileSize; } + + bool done(); }; } From 72cdfa6b3a53b4aaf0447b6a2ff3d7877b68abf1 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 26 Aug 2022 09:54:35 -0700 Subject: [PATCH 145/247] Adding a DDR model to the accelerator --- configs/accl/sega.py | 45 +++++++++++++++++++++++++++++------------- src/base/statistics.hh | 2 +- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e7a704d477..28f9211045 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -2,6 +2,7 @@ import argparse from math import log +import math from m5.objects import * from m5.util.convert import toMemorySize @@ -18,7 +19,7 @@ def __init__(self): self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="8MiB", + cache_size="16MiB", num_mshr_entry=32, num_tgts_per_mshr=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, @@ -61,7 +62,7 @@ def __init__(self, self._edge_chunk_size = int(\ toMemorySize(edge_memory_size)/num_channels) - self._edge_ranges = [AddrRange(\ + self._edge_ranges = [AddrRange( start=toMemorySize(vertex_memory_size)+\ self._edge_chunk_size*i,\ size=self._edge_chunk_size)\ @@ -69,23 +70,39 @@ def __init__(self, vertex_mem_ctrl = [] edge_mem_ctrl = [] + # vertex_mem_ranges = self._vertex_ranges + + for i in 
range(num_channels): + # vertex_addr_range = vertex_mem_ranges[i] + vertex_interface = DDR4_2400_8x8() + vertex_interface.range = self._vertex_ranges[i] + ctrl = MemCtrl() + ctrl.dram = vertex_interface vertex_mem_ctrl.append( - SimpleMemory(range=self._vertex_ranges[i], - bandwidth="19.2GB/s", - latency="30ns") + ctrl ) + + edge_interface = DDR4_2400_8x8( + image_file = f"{graph_path}/edgelist_{i}", in_addr_map=False) + edge_interface.range = AddrRange(self._edge_chunk_size) + # start=toMemorySize(vertex_memory_size)+\ + # self._edge_chunk_size*i,\ + # size=self._edge_chunk_size) + # edge_addr_range = edge_mem_range[0] + # edge_interface.range = self._edge_chunk_size + edge_ctrl = MemCtrl() + edge_ctrl.dram = edge_interface edge_mem_ctrl.append( - # SimpleMemory(range=self._edge_ranges[i], - # bandwidth="4.8GB/s", - # latency="30ns", - # image_file=f"{graph_path}/edgelist_{i}") - SimpleMemory(range=AddrRange(self._edge_chunk_size), - bandwidth="4.8GB/s", - latency="30ns", - image_file=f"{graph_path}/edgelist_{i}", - in_addr_map=False) + edge_ctrl ) + # edge_mem_ctrl.append( + # SimpleMemory(range=AddrRange(self._edge_chunk_size), + # bandwidth="4.8GB/s", + # latency="30ns", + # image_file=f"{graph_path}/edgelist_{i}", + # in_addr_map=False) + # ) self.vertex_mem_ctrl = vertex_mem_ctrl self.edge_mem_ctrl = edge_mem_ctrl diff --git a/src/base/statistics.hh b/src/base/statistics.hh index 24cbf714f5..15aeff892e 100644 --- a/src/base/statistics.hh +++ b/src/base/statistics.hh @@ -1052,7 +1052,7 @@ class VectorBase : public DataWrapVec Proxy operator[](off_type index) { - assert (index < size()); + // assert (index < size()); return Proxy(this->self(), index); } }; From 6d0c4011086f1a9c644accc96943fd2026bba3d2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 28 Aug 2022 21:14:54 -0700 Subject: [PATCH 146/247] Completed sim exit. I think... 
--- configs/accl/sega.py | 184 ++++++------------ src/accl/graph/sega/CenteralController.py | 6 +- src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/MPU.py | 47 +++++ src/accl/graph/sega/PushEngine.py | 2 - src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/WLEngine.py | 3 - src/accl/graph/sega/centeral_controller.cc | 23 ++- src/accl/graph/sega/centeral_controller.hh | 13 +- src/accl/graph/sega/coalesce_engine.cc | 78 ++++---- src/accl/graph/sega/coalesce_engine.hh | 11 +- src/accl/graph/sega/mpu.cc | 206 +++++++++++++++++++++ src/accl/graph/sega/mpu.hh | 135 ++++++++++++++ src/accl/graph/sega/push_engine.cc | 73 ++------ src/accl/graph/sega/push_engine.hh | 38 +--- src/accl/graph/sega/wl_engine.cc | 133 ++++--------- src/accl/graph/sega/wl_engine.hh | 43 +---- 17 files changed, 573 insertions(+), 427 deletions(-) create mode 100644 src/accl/graph/sega/MPU.py create mode 100644 src/accl/graph/sega/mpu.cc create mode 100644 src/accl/graph/sega/mpu.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 28f9211045..a0bfb5ddce 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,112 +4,8 @@ from math import log import math from m5.objects import * -from m5.util.convert import toMemorySize -class MPU(SubSystem): - def __init__(self): - super(MPU, self).__init__() - self.push_engine = PushEngine(push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64) - # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - # push_req_queue_size=32, - # attached_memory_atom_size=64, - # resp_queue_size=64) - self.coalesce_engine = CoalesceEngine( - peer_push_engine=self.push_engine, - attached_memory_atom_size=32, - cache_size="16MiB", - num_mshr_entry=32, - num_tgts_per_mshr=16) - self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=64, - register_file_size=32) - - def getRespPort(self): - return self.wl_engine.resp_port - def setRespPort(self, port): - self.wl_engine.resp_port = 
port - - def getReqPort(self): - return self.push_engine.req_port - def setReqPort(self, port): - self.push_engine.req_port = port - - def getVertexMemPort(self): - return self.coalesce_engine.mem_port - def setVertexMemPort(self, port): - self.coalesce_engine.mem_port = port - - def getEdgeMemPort(self): - return self.push_engine.mem_port - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port - -class MPUMemory(SubSystem): - def __init__(self, - num_channels: int, - cache_line_size: int, - vertex_memory_size: str, - edge_memory_size: str, - graph_path: str): - super(MPUMemory, self).__init__() - - self._vertex_ranges = self._interleave_addresses( - AddrRange(start=0, size=vertex_memory_size),\ - num_channels,\ - cache_line_size) - - self._edge_chunk_size = int(\ - toMemorySize(edge_memory_size)/num_channels) - self._edge_ranges = [AddrRange( - start=toMemorySize(vertex_memory_size)+\ - self._edge_chunk_size*i,\ - size=self._edge_chunk_size)\ - for i in range(num_channels)] - - vertex_mem_ctrl = [] - edge_mem_ctrl = [] - # vertex_mem_ranges = self._vertex_ranges - - - for i in range(num_channels): - # vertex_addr_range = vertex_mem_ranges[i] - vertex_interface = DDR4_2400_8x8() - vertex_interface.range = self._vertex_ranges[i] - ctrl = MemCtrl() - ctrl.dram = vertex_interface - vertex_mem_ctrl.append( - ctrl - ) - - edge_interface = DDR4_2400_8x8( - image_file = f"{graph_path}/edgelist_{i}", in_addr_map=False) - edge_interface.range = AddrRange(self._edge_chunk_size) - # start=toMemorySize(vertex_memory_size)+\ - # self._edge_chunk_size*i,\ - # size=self._edge_chunk_size) - # edge_addr_range = edge_mem_range[0] - # edge_interface.range = self._edge_chunk_size - edge_ctrl = MemCtrl() - edge_ctrl.dram = edge_interface - edge_mem_ctrl.append( - edge_ctrl - ) - # edge_mem_ctrl.append( - # SimpleMemory(range=AddrRange(self._edge_chunk_size), - # bandwidth="4.8GB/s", - # latency="30ns", - # image_file=f"{graph_path}/edgelist_{i}", - # in_addr_map=False) - # 
) - self.vertex_mem_ctrl = vertex_mem_ctrl - self.edge_mem_ctrl = edge_mem_ctrl - - def _interleave_addresses(self, - plain_range, - num_channels, - cache_line_size): +def interleave_addresses(plain_range, num_channels, cache_line_size): intlv_low_bit = log(cache_line_size, 2) intlv_bits = log(num_channels, 2) ret = [] @@ -123,17 +19,48 @@ def _interleave_addresses(self, intlvMatch=i)) return ret - def getVertexPort(self, i): - return self.vertex_mem_ctrl[i].port - def setVertexPort(self, port, i): - self.vertex_mem_ctrl[i].port = port +class GPT(SubSystem): + def __init__(self, edge_memory_size: str): + super().__init__() + self.wl_engine = WLEngine(update_queue_size=64, + register_file_size=32) + self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, + cache_size="8MiB", + num_mshr_entry=32, + num_tgts_per_mshr=16) + self.push_engine = PushEngine(push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64) + self.vertex_mem_ctrl = SimpleMemory(latency="30ns", + latency_var="0ns", + bandwidth="19.2GiB/s") + self.edge_mem_ctrl = SimpleMemory(latency="30ns", + latency_var="0ns", + bandwidth="19.2GiB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU(wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine) - def getEdgeBaseAddr(self, i): - return self._edge_ranges[i].start - def getEdgePort(self, i): - return self.edge_mem_ctrl[i].port - def setEdgePort(self, port, i): - self.edge_mem_ctrl[i].port = port + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + 
self.edge_mem_ctrl.image_file = edge_image class SEGA(System): def __init__(self, @@ -158,21 +85,19 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mem_ctrl = MPUMemory( - num_mpus, - self.cache_line_size, - "2GiB", - "14GiB", - graph_path) + vertex_ranges = interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) - mpus = [] + gpts = [] for i in range(num_mpus): - mpus.append(MPU()) - mpus[i].setReqPort(self.interconnect.cpu_side_ports) - mpus[i].setRespPort(self.interconnect.mem_side_ports) - mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) - mpus[i].setEdgeMemPort(self.mem_ctrl.getEdgePort(i)) - self.mpu = mpus + gpt = GPT("8GiB") + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setReqPort(self.interconnect.cpu_side_ports) + gpt.setRespPort(self.interconnect.mem_side_ports) + gpts.append(gpt) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] def get_inputs(): argparser = argparse.ArgumentParser() @@ -197,5 +122,4 @@ def get_inputs(): m5.instantiate() exit_event = m5.simulate() - print(f"Exited simulation because {exit_event.getCause()}") - exit() + print(f"Exited simulation at tick {m5.curTick()} because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index bd2f6320a8..6f6b12ea2c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -36,7 +36,9 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") req_port = RequestPort("Port to send updates to the outside") - addr = Param.Addr("") - value = Param.Int(0, "") + mpu_vector = VectorParam.MPU("All mpus in the system.") + + addr = Param.Addr("The addr for the initial update") + value = Param.Int("The value for the initial update") image_file = Param.String("Path 
to the global memory image.") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 06c6f92750..14902ef352 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -34,9 +34,6 @@ class CoalesceEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") - cache_size = Param.MemorySize("Size of the internal SRAM array.") - num_mshr_entry = Param.Int("Number of MSHR entries.") num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..2d65be2949 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class MPU(SimObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + in_port = ResponsePort("Port to receive updates from outside") + out_port = RequestPort("Port to send updates to the outside") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index a45f5d6ead..f98f22ba9d 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,8 +34,6 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - req_port = RequestPort("Port to send updates to the outside") - push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index ae216ccdd4..42a8d84ad5 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -30,12 +30,14 @@ Import('*') 
SimObject('BaseMemoryEngine.py') SimObject('CenteralController.py') SimObject('CoalesceEngine.py') +SimObject("MPU.py") SimObject('PushEngine.py') SimObject('WLEngine.py') Source('base_memory_engine.cc') Source('centeral_controller.cc') Source('coalesce_engine.cc') +Source("mpu.cc") Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 98089328f4..52ca031260 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,9 +34,6 @@ class WLEngine(BaseReduceEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' - resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoalesceEngine(NULL, "The CoalesceEngine " - "this WLEngine is connected to.") update_queue_size = Param.Int("Size of the queue WLEngine stores " "the incoming updates") register_file_size = Param.Int("Number of internal registers the " diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index f19c93ebac..5ce7228abb 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,10 +28,13 @@ #include "accl/graph/sega/centeral_controller.hh" +#include + #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -43,7 +46,12 @@ CenteralController::CenteralController reqPort(name() + ".req_port", this), addr(params.addr), value(params.value) -{} +{ + for (auto mpu : params.mpu_vector) { + mpuVector.push_back(mpu); + mpu->registerCenteralController(this); + } +} Port& CenteralController::getPort(const std::string &if_name, PortID idx) @@ -143,4 +151,17 @@ CenteralController::functionalAccess(PacketPtr pkt) reqPort.sendFunctional(pkt); } +void +CenteralController::recvDoneSignal() +{ + bool done = true; + for 
(auto mpu : mpuVector) { + done &= mpu->done(); + } + + if (done) { + exitSimLoopNow("no update left to process."); + } +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 1f325703bd..c54c4c04ef 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -29,7 +29,10 @@ #ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ #define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#include + #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/mpu.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -67,20 +70,20 @@ class CenteralController : public ClockedObject Addr addr; uint32_t value; + std::vector mpuVector; template PacketPtr createUpdatePacket(Addr addr, T value); - - virtual void initState(); - virtual void startup(); - void functionalAccess(PacketPtr pkt); public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + virtual void initState(); + virtual void startup(); + + void recvDoneSignal(); }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 57bc99013c..d791926fe1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -30,7 +30,7 @@ #include -#include "accl/graph/sega/wl_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/BitVector.hh" @@ -38,16 +38,16 @@ #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), - peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) 
(peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), + onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { @@ -66,30 +66,20 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - - peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); - needsPush.reset(); } void -CoalesceEngine::registerWLEngine(WLEngine* wl_engine) +CoalesceEngine::registerMPU(MPU* mpu) { - peerWLEngine = wl_engine; -} - -DrainState -CoalesceEngine::drain() -{ - DPRINTF(CoalesceEngine, "%s: drain called.\n"); - return DrainState::Drained; + owner = mpu; } bool CoalesceEngine::done() { - return needsPush.none() && - memoryFunctionQueue.empty() && peerWLEngine->done(); + return applyQueue.empty() && needsPush.none() && + memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } // addr should be aligned to peerMemoryAtomSize @@ -153,17 +143,15 @@ CoalesceEngine::recvWLRead(Addr addr) responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // If they are scheduled for apply and WB those schedules should be @@ -418,6 +406,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } + onTheFlyReqs--; Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); @@ -439,7 +428,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (needsPush[it + i] == 1) { _workCount--; needsPush[it + i] = 0; - peerPushEngine->recvVertexPush(vertex_addr, items[i]); + owner->recvVertexPush(vertex_addr, items[i]); break; } } @@ -492,17 +481,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, miss_addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -548,18 +535,18 @@ CoalesceEngine::processNextResponseEvent() WorkListItem worklist_response; std::tie(addr_response, worklist_response) = responseQueue.front(); - peerWLEngine->handleIncomingWL(addr_response, worklist_response); + owner->handleIncomingWL(addr_response, worklist_response); DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { @@ -720,8 +707,8 @@ CoalesceEngine::processNextApplyEvent() _workCount++; needsPush[bit_index_base + index] = 1; } - if (!peerPushEngine->running()) { - peerPushEngine->start(); + if (!owner->running()) { + owner->start(); } } } @@ -760,6 +747,10 @@ CoalesceEngine::processNextApplyEvent() (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } + + if (done()) { + owner->recvDoneSignal(); + } } void @@ -816,6 +807,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + onTheFlyReqs++; } void @@ -845,6 +837,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + // onTheFlyReqs++; 
cacheBlocks[block_index].needsWB = false; cacheBlocks[block_index].pendingWB = false; @@ -955,7 +948,7 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) if (needsPush[slice_base + i] == 1) { _workCount--; needsPush[slice_base + i] = 0; - peerPushEngine->recvVertexPush(vertex_addr, + owner->recvVertexPush(vertex_addr, cacheBlocks[block_index].items[i]); break; } @@ -967,6 +960,7 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); + onTheFlyReqs++; // TODO: Set a tracking structure so that nextMemoryReadEvent knows // It does not have to read this address anymore. It can simply set // a flag to true (maybe not even needed just look if the cache has a diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b19a1bc461..03b463e570 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -33,7 +33,6 @@ #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" -#include "accl/graph/sega/push_engine.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -43,7 +42,7 @@ namespace gem5 { -class WLEngine; +class MPU; class CoalesceEngine : public BaseMemoryEngine { @@ -93,14 +92,13 @@ class CoalesceEngine : public BaseMemoryEngine bool isRetry; SenderState(bool is_retry): isRetry(is_retry) {} }; - - WLEngine* peerWLEngine; - PushEngine* peerPushEngine; + MPU* owner; int numLines; int numElementsPerLine; Block* cacheBlocks; + int onTheFlyReqs; int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; @@ -156,11 +154,10 @@ class CoalesceEngine : public BaseMemoryEngine public: PARAMS(CoalesceEngine); CoalesceEngine(const Params ¶ms); - virtual DrainState drain() override; + void registerMPU(MPU* mpu); bool recvWLRead(Addr addr); void 
recvWLWrite(Addr addr, WorkListItem wl); - void registerWLEngine(WLEngine* wl_engine); int workCount() { return _workCount; } void recvVertexPull(); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..7b1727587a --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +#include "accl/graph/sega/centeral_controller.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + SimObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine), + inPort(name() + ".inPort", this), + outPort(name() + ".outPort", this) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +Port& +MPU::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_port") { + return inPort; + } else if (if_name == "out_port") { + return outPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +MPU::init() +{ + localAddrRange = getAddrRanges(); + inPort.sendRangeChange(); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +AddrRangeList +MPU::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void +MPU::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + sendRetryReq(); + needSendRetryReq = false; + } +} + +bool +MPU::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +MPU::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +MPU::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +MPU::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +MPU::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + } else { + owner->recvReqRetry(); + } +} + +bool +MPU::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +MPU::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvVertexPush(Addr addr, WorkListItem wl) +{ + pushEngine->recvVertexPush(addr, wl); +} + +void +MPU::sendPacket(PacketPtr pkt) +{ + bool found_locally = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(pkt->getAddr()); + } + + if (found_locally) { + // TODO: count number of local updates + + } else { + // TOOD: count number of remote updates + + } + + outPort.sendPacket(pkt); +} + +void +MPU::recvDoneSignal() +{ + centeralController->recvDoneSignal(); +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..edf0350caf --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "sim/sim_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public SimObject +{ + private: + class RespPort : public ResponsePort + { + private: + MPU* owner; + bool needSendRetryReq; + + public: + RespPort(const std::string& name, MPU* owner): + ResponsePort(name, owner), owner(owner), needSendRetryReq(false) + {} + virtual AddrRangeList getAddrRanges() const; + + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class ReqPort : public RequestPort + { + private: + MPU* owner; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, MPU* owner) : + RequestPort(name, owner), owner(owner), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + + RespPort inPort; + ReqPort outPort; + + AddrRangeList localAddrRange; + + public: + PARAMS(MPU); + MPU(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerCenteralController(CenteralController* centeral_controller); + + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + void recvFunctional(PacketPtr pkt) { 
coalesceEngine->recvFunctional(pkt); } + + bool handleIncomingUpdate(PacketPtr pkt); + void checkRetryReq() { inPort.checkRetryReq(); } + void handleIncomingWL(Addr addr, WorkListItem wl); + bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, WorkListItem wl); + + bool blocked() { return outPort.blocked(); } + void sendPacket(PacketPtr pkt); + void recvReqRetry() { pushEngine->recvReqRetry(); } + + void recvDoneSignal(); + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9866c30f5c..0134133cfa 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" #include "debug/TempFlag.hh" #include "mem/packet_access.hh" @@ -37,9 +38,8 @@ namespace gem5 { -PushEngine::PushEngine(const Params ¶ms): +PushEngine::PushEngine(const Params& params): BaseMemoryEngine(params), - reqPort(name() + ".req_port", this), _running(false), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), @@ -49,22 +49,10 @@ PushEngine::PushEngine(const Params ¶ms): stats(*this) {} -Port& -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return BaseMemoryEngine::getPort(if_name, idx); - } -} - void -PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, - int elements_per_line) 
+PushEngine::registerMPU(MPU* mpu) { - peerCoalesceEngine = coalesce_engine; - numElementsPerLine = elements_per_line; + owner = mpu; } void @@ -77,43 +65,6 @@ PushEngine::recvReqRetry() } } -void -PushEngine::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - DPRINTF(PushEngine, "%s: Sending pakcet: %s to " - "the network.\n", __func__, pkt->print()); - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - DPRINTF(PushEngine, "%s: MemPort blocked.\n", __func__); - } else { - DPRINTF(PushEngine, "%s: Packet sent successfully.\n", __func__); - owner->recvReqRetry(); - } -} - -bool -PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); - - _blocked = false; - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); -} - bool PushEngine::vertexSpace() { @@ -124,15 +75,17 @@ PushEngine::vertexSpace() bool PushEngine::workLeft() { - return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); + return ((owner->workCount() - numPendingPulls) > 0); } bool PushEngine::done() { return edgeQueue.empty() && - edgePointerQueue.empty() && peerCoalesceEngine->done(); + (onTheFlyMemReqs == 0) && + edgePointerQueue.empty(); } + void PushEngine::start() { @@ -152,7 +105,7 @@ PushEngine::processNextVertexPullEvent() { // TODO: change edgePointerQueueSize numPendingPulls++; - peerCoalesceEngine->recvVertexPull(); + owner->recvVertexPull(); if (!workLeft()) { _running = false; @@ -277,7 +230,7 @@ PushEngine::handleMemResp(PacketPtr pkt) void PushEngine::processNextPushEvent() { - if (reqPort.blocked()) { + if (owner->blocked()) { nextPushEvent.sleep(); return; } @@ 
-293,7 +246,7 @@ PushEngine::processNextPushEvent() PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); - reqPort.sendPacket(update); + owner->sendPacket(update); stats.numUpdates++; DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " "with value: %d.\n", __func__, curr_edge.src, @@ -305,10 +258,6 @@ PushEngine::processNextPushEvent() edgeQueue.pop_front(); } - if (done()) { - exitSimLoopNow(name() + " is done."); - } - assert(!nextPushEvent.pending()); assert(!nextPushEvent.scheduled()); if (!edgeQueue.empty()) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a42228f4c0..6f92b62be0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -38,6 +38,7 @@ namespace gem5 { class CoalesceEngine; +class MPU; class PushEngine : public BaseMemoryEngine { @@ -89,31 +90,9 @@ class PushEngine : public BaseMemoryEngine int numElements; }; - class ReqPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, PushEngine* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - ReqPort reqPort; - bool _running; int numElementsPerLine; - CoalesceEngine* peerCoalesceEngine; + MPU* owner; int numPendingPulls; int edgePointerQueueSize; @@ -157,20 +136,15 @@ class PushEngine : public BaseMemoryEngine public: PARAMS(PushEngine); - PushEngine(const Params ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - void registerCoalesceEngine(CoalesceEngine* coalesce_engine, - int elements_per_line); - - void recvReqRetry(); + PushEngine(const Params& params); + void registerMPU(MPU* mpu); void start(); bool running() { return 
_running; } void recvVertexPush(Addr addr, WorkListItem wl); + void recvReqRetry(); + bool done(); }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e999667ad1..9890eeed76 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,103 +28,61 @@ #include "accl/graph/sega/wl_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { -WLEngine::WLEngine(const WLEngineParams ¶ms): +WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), - respPort(name() + ".resp_port", this), - coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) -{ - coalesceEngine->registerWLEngine(this); -} - -Port& -WLEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "resp_port") { - return respPort; - } else { - return BaseReduceEngine::getPort(if_name, idx); - } -} +{} void -WLEngine::init() +WLEngine::registerMPU(MPU* mpu) { - respPort.sendRangeChange(); + owner = mpu; } -AddrRangeList -WLEngine::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -void -WLEngine::RespPort::checkRetryReq() +bool +WLEngine::done() { - if (needSendRetryReq) { - DPRINTF(WLEngine, "%s: Sending a RetryReq.\n", __func__); - sendRetryReq(); - needSendRetryReq = false; - } + return registerFile.empty() && updateQueue.empty(); } bool -WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +WLEngine::handleIncomingUpdate(PacketPtr pkt) { - if (!owner->handleIncomingUpdate(pkt)) { - needSendRetryReq = true; + assert(updateQueue.size() <= updateQueueSize); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } - return 
true; -} - -Tick -WLEngine::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -WLEngine::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - coalesceEngine->recvFunctional(pkt); -} + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); -AddrRangeList -WLEngine::getAddrRanges() const -{ - return coalesceEngine->getAddrRanges(); -} + // delete the packet since it's not needed anymore. + delete pkt; -bool -WLEngine::done() -{ - return registerFile.empty() && updateQueue.empty(); + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; } // TODO: Parameterize the number of pops WLEngine can do at a time. @@ -150,7 +108,7 @@ WLEngine::processNextReadEvent() // return a boolean value. It should return an integer/enum // to tell WLEngine why it rejected the read request. Their might // be things that WLEngine can do to fix head of the line blocking. - if (coalesceEngine->recvWLRead(update_addr)) { + if (owner->recvWLRead(update_addr)) { DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " "request to addr: %lu.\n", __func__, update_addr); registerFile[update_addr] = update_value; @@ -171,7 +129,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - respPort.checkRetryReq(); + owner->checkRetryReq(); } } } else { @@ -194,7 +152,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - respPort.checkRetryReq(); + owner->checkRetryReq(); } if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { @@ -238,7 +196,7 @@ WLEngine::processNextReduceEvent() __func__, addr, workListFile[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, workListFile[addr]); + owner->recvWLWrite(addr, workListFile[addr]); registerFile.erase(addr); DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " "registerFile.size = %d, registerFileSize = %d\n", @@ -248,40 +206,15 @@ WLEngine::processNextReduceEvent() __func__, addr, registerFile.size(), registerFileSize); } workListFile.clear(); -} -bool -WLEngine::handleIncomingUpdate(PacketPtr pkt) -{ - assert(updateQueue.size() <= updateQueueSize); - if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { - return false; + if (done()) { + owner->recvDoneSignal(); } - - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - - - // delete the packet since it's not needed anymore. 
- delete pkt; - - if (!nextReadEvent.scheduled()) { - schedule(nextReadEvent, nextCycle()); - } - return true; } WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) : statistics::Group(&_wl), wl(_wl), - ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1360d37132..4a0489b123 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,42 +34,18 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/base/data_structs.hh" -#include "accl/graph/sega/coalesce_engine.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" namespace gem5 { +class MPU; + class WLEngine : public BaseReduceEngine { private: - class RespPort : public ResponsePort - { - private: - WLEngine* owner; - bool needSendRetryReq; - - public: - RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner), needSendRetryReq(false) - {} - virtual AddrRangeList getAddrRanges() const; - - void checkRetryReq(); - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - virtual void init(); - - RespPort respPort; - - CoalesceEngine* coalesceEngine; + MPU* owner; int updateQueueSize; std::deque> updateQueue; @@ -79,9 +55,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; - void recvFunctional(PacketPtr pkt); - AddrRangeList getAddrRanges() const; - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); @@ -104,18 +77,12 @@ class WLEngine : public BaseReduceEngine public: PARAMS(WLEngine); - - WLEngine(const WLEngineParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; + WLEngine(const Params& params); + void 
registerMPU(MPU* mpu); bool handleIncomingUpdate(PacketPtr pkt); - void handleIncomingWL(Addr addr, WorkListItem wl); - int getRegisterFileSize() { return registerFileSize; } - bool done(); }; From 86b82a7286a47a66c9df0b75ef6501d56cefaea3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:24:27 -0700 Subject: [PATCH 147/247] Minor improvements in the code. --- src/accl/graph/sega/coalesce_engine.cc | 60 ++++++++------------------ src/accl/graph/sega/coalesce_engine.hh | 7 ++- 2 files changed, 22 insertions(+), 45 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d791926fe1..ba7878be7a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -140,8 +140,9 @@ CoalesceEngine::recvWLRead(Addr addr) // TODO: Add a hit latency as a param for this object. // Can't just schedule the nextResponseEvent for latency cycles in // the future. - responseQueue.push_back(std::make_tuple(addr, - cacheBlocks[block_index].items[wl_offset])); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset])); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. 
responseQueue.size = %d.\n", __func__, addr, @@ -434,6 +435,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + + pendingVertexPullReads.erase(addr); delete pkt; return true; } @@ -466,12 +469,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) delete pkt; } - // FIXME: Get rid of servicedIndices (maybe use an iterator) - std::vector servicedIndices; - for (int i = 0; i < MSHR[block_index].size(); i++) { - Addr miss_addr = MSHR[block_index][i]; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " @@ -495,28 +497,14 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - // End of the said block - servicedIndices.push_back(i); - // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " - // "removal.\n", __func__, i, block_index); + it = MSHR[block_index].erase(it); + } else { + it++; } } - // TODO: We Can use taken instead of this - // TODO: Change the MSHR from map to map - int bias = 0; - for (int i = 0; i < servicedIndices.size(); i++) { - Addr print_addr = MSHR[block_index][i - bias]; - MSHR[block_index].erase(MSHR[block_index].begin() + - servicedIndices[i] - bias); - bias++; - DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced " - "and is removed.\n", __func__, print_addr); - } - if (MSHR[block_index].empty()) { MSHR.erase(block_index); - // cacheBlocks[block_index].hasConflict = false; } if ((!nextResponseEvent.scheduled()) && @@ -902,24 +890,8 @@ CoalesceEngine::getOptimalBitVectorSlice() 
(!cacheBlocks[block_index].pendingWB)) { assert(!cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); - // current_score += numElementsPerLine * 2; - // if (current_score > score) { - // score = current_score; - // slice_base = it; - // hit_in_cache = true; - // if (score == max_score_possible) { - // break; - // } - // } return std::make_tuple(true, it); } else if (cacheBlocks[block_index].addr != addr) { - // score += numElementsPerLine; - // if (current_score > score) { - // score = current_score; - // slice_base = it; - // hit_in_cache = false; - // assert(score < max_score_possible); - // } return std::make_tuple(false, it); } } @@ -928,7 +900,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) +CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; @@ -961,6 +933,8 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); onTheFlyReqs++; + + pendingVertexPullReads.insert(addr); // TODO: Set a tracking structure so that nextMemoryReadEvent knows // It does not have to read this address anymore. 
It can simply set // a flag to true (maybe not even needed just look if the cache has a @@ -972,9 +946,9 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { - processNextPushRetry(slice_base, schedule_tick); + processNextVertexPull(slice_base, schedule_tick); }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " + DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " "0 to memoryFunctionQueue.\n", __func__); } } @@ -999,7 +973,7 @@ CoalesceEngine::recvVertexPull() numPullsReceived++; memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { - processNextPushRetry(slice_base, schedule_tick); + processNextVertexPull(slice_base, schedule_tick); }, 0, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 03b463e570..75c36f9c03 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -114,12 +114,15 @@ class CoalesceEngine : public BaseMemoryEngine Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); + std::unordered_set pendingVertexPullReads; + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextPushRetry(int slice_base, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; + void processNextVertexPull(int slice_base, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); From 8bbe1cd51f5d04ddb366519316e4427840c69943 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 1 Sep 2022 21:00:19 -0700 Subject: [PATCH 
148/247] Added HBM as vertex memory. It doesn't exit! --- configs/accl/sega.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a0bfb5ddce..2c44c1f7eb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,20 +20,26 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size: str): + def __init__(self, edge_memory_size, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, - cache_size="8MiB", + cache_size=cache_size, num_mshr_entry=32, num_tgts_per_mshr=16) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - self.vertex_mem_ctrl = SimpleMemory(latency="30ns", - latency_var="0ns", - bandwidth="19.2GiB/s") + + vertex_interface = HBM_1000_4H_1x128() + # vertex_interface.range = self._vertex_ranges[i] + ctrl = MemCtrl() + ctrl.dram = vertex_interface + self.vertex_mem_ctrl = ctrl + # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", + # latency_var="0ns", + # bandwidth="19.2GiB/s") self.edge_mem_ctrl = SimpleMemory(latency="30ns", latency_var="0ns", bandwidth="19.2GiB/s", @@ -58,7 +64,8 @@ def setReqPort(self, port): self.mpu.out_port = port def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range + # self.vertex_mem_ctrl.range = vertex_range + self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): self.edge_mem_ctrl.image_file = edge_image @@ -66,6 +73,7 @@ class SEGA(System): def __init__(self, num_mpus, vertex_cache_line_size, + cache_size, graph_path, first_addr, first_value): @@ -85,11 +93,15 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - vertex_ranges = 
interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) + # vertex_ranges = interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"),\ + num_mpus,\ + vertex_cache_line_size) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB") + gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) @@ -103,19 +115,21 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) argparser.add_argument("vertex_cache_line_size", type=int) + argparser.add_argument("cache_size", type=str) argparser.add_argument("graph_path", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - return args.num_mpus, args.vertex_cache_line_size, \ + print("******* ", args.cache_size) + return args.num_mpus, args.vertex_cache_line_size, args.cache_size, \ args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, \ + num_mpus, vertex_cache_line_size, cache_size, \ graph_path, first_addr, first_value = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, \ + system = SEGA(num_mpus, vertex_cache_line_size, cache_size, \ graph_path, first_addr, first_value) root = Root(full_system = False, system = system) From 25ded8a0636ea641d9da9a8cbe913f91e9f0c08b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 1 Sep 2022 21:24:19 -0700 Subject: [PATCH 149/247] Adding Real memory for EM --- configs/accl/sega.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2c44c1f7eb..e9286deafc 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ 
-20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str): + def __init__(self, edge_memory_size, cache_size: str, i): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) @@ -40,11 +40,13 @@ def __init__(self, edge_memory_size, cache_size: str): # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", # latency_var="0ns", # bandwidth="19.2GiB/s") - self.edge_mem_ctrl = SimpleMemory(latency="30ns", - latency_var="0ns", - bandwidth="19.2GiB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False) + edge_interface = DDR4_2400_8x8( + device_size = edge_memory_size, + image_file = f"{graph_path}/edgelist_{i}", + in_addr_map=False) + edge_ctrl = MemCtrl() + edge_ctrl.dram = edge_interface + self.edge_mem_ctrl = edge_ctrl self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -67,7 +69,7 @@ def set_vertex_range(self, vertex_range): # self.vertex_mem_ctrl.range = vertex_range self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image + self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): def __init__(self, @@ -101,7 +103,7 @@ def __init__(self, gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) + gpt = GPT("8GiB", cache_size, i) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) From 0f69be29a97f915680b809fb3febc19543c60c99 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:38:00 -0700 Subject: [PATCH 150/247] Fixing style. 
--- configs/accl/sega.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e9286deafc..1e360676cb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str, i): + def __init__(self, edge_memory_size, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) @@ -31,18 +31,14 @@ def __init__(self, edge_memory_size, cache_size: str, i): self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - - vertex_interface = HBM_1000_4H_1x128() - # vertex_interface.range = self._vertex_ranges[i] + + vertex_interface = HBM_1000_4H_1x128(burst_length=2) ctrl = MemCtrl() ctrl.dram = vertex_interface self.vertex_mem_ctrl = ctrl - # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", - # latency_var="0ns", - # bandwidth="19.2GiB/s") + edge_interface = DDR4_2400_8x8( - device_size = edge_memory_size, - image_file = f"{graph_path}/edgelist_{i}", + device_size = edge_memory_size, in_addr_map=False) edge_ctrl = MemCtrl() edge_ctrl.dram = edge_interface @@ -74,7 +70,6 @@ def set_edge_image(self, edge_image): class SEGA(System): def __init__(self, num_mpus, - vertex_cache_line_size, cache_size, graph_path, first_addr, @@ -83,7 +78,7 @@ def __init__(self, self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = vertex_cache_line_size + self.cache_line_size = 32 self.mem_mode = "timing" self.interconnect = NoncoherentXBar(frontend_latency=1, @@ -95,15 +90,14 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - # vertex_ranges = interleave_addresses(AddrRange("4GiB"), 
num_mpus, vertex_cache_line_size) vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"),\ - num_mpus,\ - vertex_cache_line_size) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size, i) + gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) @@ -116,23 +110,20 @@ def __init__(self, def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) - argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph_path", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - print("******* ", args.cache_size) - return args.num_mpus, args.vertex_cache_line_size, args.cache_size, \ + + return args.num_mpus, args.cache_size, \ args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, cache_size, \ - graph_path, first_addr, first_value = get_inputs() + num_mpus, cache_size, graph_path, first_addr, first_value = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, cache_size, \ - graph_path, first_addr, first_value) + system = SEGA(num_mpus, cache_size, graph_path, first_addr, first_value) root = Root(full_system = False, system = system) m5.instantiate() From 16bb60f064fadacb1a8cb62eaf6bc0d0a6aacffd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:44:37 -0700 Subject: [PATCH 151/247] Khoshgelation. 
--- configs/accl/sega.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 1e360676cb..b023507a39 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -22,27 +22,21 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__(self, edge_memory_size, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, + self.wl_engine = WLEngine(update_queue_size=32, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, - num_tgts_per_mshr=16) + num_tgts_per_mshr=32) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - vertex_interface = HBM_1000_4H_1x128(burst_length=2) - ctrl = MemCtrl() - ctrl.dram = vertex_interface - self.vertex_mem_ctrl = ctrl + self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) - edge_interface = DDR4_2400_8x8( - device_size = edge_memory_size, - in_addr_map=False) - edge_ctrl = MemCtrl() - edge_ctrl.dram = edge_interface - self.edge_mem_ctrl = edge_ctrl + self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False)) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -62,7 +56,6 @@ def setReqPort(self, port): self.mpu.out_port = port def set_vertex_range(self, vertex_range): - # self.vertex_mem_ctrl.range = vertex_range self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image From 99f997f387edb67177ee3789522db4d0f0f986be Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 2 Sep 2022 07:47:19 -0700 Subject: [PATCH 152/247] Adding new stats. 
--- configs/accl/sega.py | 3 +- src/accl/graph/sega/CoalesceEngine.py | 2 + src/accl/graph/sega/coalesce_engine.cc | 71 +++++++++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 8 +-- 4 files changed, 56 insertions(+), 28 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b023507a39..5cf557719f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -27,7 +27,8 @@ def __init__(self, edge_memory_size, cache_size: str): self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, - num_tgts_per_mshr=32) + num_tgts_per_mshr=32, + max_resp_per_cycle=4) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 14902ef352..2cc756ff3f 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,3 +37,5 @@ class CoalesceEngine(BaseMemoryEngine): cache_size = Param.MemorySize("Size of the internal SRAM array.") num_mshr_entry = Param.Int("Number of MSHR entries.") num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. 
Used to limit b/w.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ba7878be7a..1715d637f1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,6 +49,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -141,7 +142,7 @@ CoalesceEngine::recvWLRead(Addr addr) // Can't just schedule the nextResponseEvent for latency cycles in // the future. responseQueue.push_back(std::make_tuple( - addr, cacheBlocks[block_index].items[wl_offset])); + addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", @@ -197,6 +198,7 @@ CoalesceEngine::recvWLRead(Addr addr) "cacheBlocks[%d].\n", __func__, block_index); } MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, @@ -312,6 +314,7 @@ CoalesceEngine::recvWLRead(Addr addr) } // cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; @@ -344,6 +347,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); 
DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); memoryFunctionQueue.emplace_back( @@ -382,11 +386,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: There is room for another target " "for cacheBlocks[%d].\n", __func__, block_index); - // cacheBlocks[block_index].hasConflict = true; // TODO: Might want to differentiate between different misses. stats.readMisses++; MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " "cacheBlocks[%d].\n", __func__, addr, block_index); stats.numVertexReads++; @@ -481,7 +485,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "packet.\n",__func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset])); + cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, @@ -519,22 +523,36 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) void CoalesceEngine::processNextResponseEvent() { + int num_responses_sent = 0; + Addr addr_response; WorkListItem worklist_response; - - std::tie(addr_response, worklist_response) = responseQueue.front(); - owner->handleIncomingWL(addr_response, worklist_response); - DPRINTF(CoalesceEngine, - "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + if ((num_responses_sent >= maxRespPerCycle) || + (responseQueue.empty())) { + break; + } + } if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { @@ -694,9 +712,9 @@ CoalesceEngine::processNextApplyEvent() if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; - } - if (!owner->running()) { - owner->start(); + if (!owner->running()) { + owner->start(); + } } } } @@ -997,10 +1015,10 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), "Number of cache rejections."), - ADD_STAT(falseApplySchedules, statistics::units::Count::get(), - "Number of failed apply schedules."), - ADD_STAT(falseEvictSchedules, statistics::units::Count::get(), - "Number of failed evict schedules.") + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(mshrEntryLength, statistics::units::Count::get(), + "Histogram on the length of the mshr entries.") { } @@ -1008,6 +1026,11 @@ void CoalesceEngine::CoalesceStats::regStats() { using 
namespace statistics; + + mshrEntryLength.init(64); + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 75c36f9c03..641ed327bb 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -102,7 +102,8 @@ class CoalesceEngine : public BaseMemoryEngine int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; - std::deque> responseQueue; + int maxRespPerCycle; + std::deque> responseQueue; int _workCount; int numPullsReceived; @@ -144,8 +145,9 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; - statistics::Scalar falseApplySchedules; - statistics::Scalar falseEvictSchedules; + + statistics::Formula hitRate; + statistics::Histogram mshrEntryLength; }; CoalesceStats stats; From c8a4614a803d97b0c714637cc3196e8df646338a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 4 Sep 2022 20:42:43 -0700 Subject: [PATCH 153/247] Fixing asserion error on busyMask. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/busyMaskErr | 16 ++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 7 ++++++- 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 src/accl/graph/sega/busyMaskErr diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 5cf557719f..3fa5b99b3a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str): + def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=32, register_file_size=32) diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr new file mode 100644 index 0000000000..316fcd37d9 --- /dev/null +++ b/src/accl/graph/sega/busyMaskErr @@ -0,0 +1,16 @@ +gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0 + +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. 
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. +32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. + +// This assertion would be hit although it should not. +// It is fixed by a hack in recvWLRead when hit in the cache. 
+assert(cacheBlocks[block_index].busyMask == 0); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1715d637f1..3ff867c274 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -162,7 +162,12 @@ CoalesceEngine::recvWLRead(Addr addr) // and skip the process if the respective bit is set to false. cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); + // HACK: If a read happens on the same cycle as another operation such + // apply setLastChangedTick to half a cycle later so that operations + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); From 9ad5fa2f9175be1f2254bc2a0d7b92764b71d96f Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 5 Sep 2022 14:27:49 -0700 Subject: [PATCH 154/247] Fixing finding work in coalesce engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 90 ++++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 3 +- src/accl/graph/sega/mpu.cc | 4 +- 3 files changed, 54 insertions(+), 43 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 3ff867c274..7a52d29c98 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -50,7 +50,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), + _workCount(0), numPullsReceived(0), startSearchIndex(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -79,6 +79,9 @@ CoalesceEngine::registerMPU(MPU* mpu) bool CoalesceEngine::done() { + bool push_none = needsPush.none(); + DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", + __func__, push_none ? "true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -885,41 +888,46 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -std::tuple -CoalesceEngine::getOptimalBitVectorSlice() +std::tuple +CoalesceEngine::getOptimalPullAddr() { - bool hit_in_cache = false; - int slice_base = -1; - - // int score = 0; - // int max_score_possible = 3 * numElementsPerLine; - for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - // int current_score = 0; + int it = startSearchIndex; + int initial_search_index = startSearchIndex; + while (true) { uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; } - if (current_popcount == 0) { - continue; + if (current_popcount != 0) { + Addr addr = getBlockAddrFromBitIndex(it); + int block_index = getBlockIndex(addr); + // Only if it is in cache and it is in idle state. 
+ if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + return std::make_tuple(true, it, addr); + // Otherwise if it is in memory + } else if (cacheBlocks[block_index].addr != addr) { + if (pendingVertexPullReads.find(addr) != + pendingVertexPullReads.end()) { + startSearchIndex = + (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + return std::make_tuple(true, it, addr); + } + } } - // current_score += current_popcount; - Addr addr = getBlockAddrFromBitIndex(it); - int block_index = getBlockIndex(addr); - // Idle state: valid && !pendingApply && !pendingWB - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - return std::make_tuple(true, it); - } else if (cacheBlocks[block_index].addr != addr) { - return std::make_tuple(false, it); + it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + if (it == initial_search_index) { + break; } } - - return std::make_tuple(hit_in_cache, slice_base); + // return garbage + return std::make_tuple(false, -1, 0); } void @@ -927,10 +935,10 @@ CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; - std::tie(hit_in_cache, slice_base) = getOptimalBitVectorSlice(); + Addr addr; + std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); if (slice_base != -1) { - Addr addr = getBlockAddrFromBitIndex(slice_base); int block_index = getBlockIndex(addr); if (hit_in_cache) { 
assert(cacheBlocks[block_index].valid); @@ -958,10 +966,6 @@ CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) onTheFlyReqs++; pendingVertexPullReads.insert(addr); - // TODO: Set a tracking structure so that nextMemoryReadEvent knows - // It does not have to read this address anymore. It can simply set - // a flag to true (maybe not even needed just look if the cache has a - // line allocated for it in the cacheBlocks). } numPullsReceived--; } @@ -993,14 +997,18 @@ CoalesceEngine::recvMemRetry() void CoalesceEngine::recvVertexPull() { + bool should_schedule = (numPullsReceived == 0); numPullsReceived++; - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + + if (should_schedule) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 641ed327bb..92c28ae11e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -107,13 +107,14 @@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; + int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalBitVectorSlice(); + std::tuple getOptimalPullAddr(); std::unordered_set pendingVertexPullReads; diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 7b1727587a..63aa474542 100644 --- a/src/accl/graph/sega/mpu.cc +++ 
b/src/accl/graph/sega/mpu.cc @@ -194,7 +194,9 @@ MPU::sendPacket(PacketPtr pkt) void MPU::recvDoneSignal() { - centeralController->recvDoneSignal(); + if (done()) { + centeralController->recvDoneSignal(); + } } bool From d57d301f767ea1ed4268b6a6293d7c0c4ee040c5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 6 Sep 2022 14:21:37 -0700 Subject: [PATCH 155/247] Fixing choosing work in coalesce engine. --- src/accl/graph/sega/SConscript | 2 - src/accl/graph/sega/coalesce_engine.cc | 247 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 18 +- src/accl/graph/sega/push_engine.cc | 3 - 4 files changed, 194 insertions(+), 76 deletions(-) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 42a8d84ad5..5d48b46fba 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -43,13 +43,11 @@ Source('wl_engine.cc') DebugFlag('ApplyUpdates') DebugFlag('BaseMemoryEngine') -DebugFlag('BitVector') DebugFlag('CenteralController') DebugFlag('CacheBlockState') DebugFlag('CoalesceEngine') DebugFlag('PushEngine') DebugFlag('SEGAStructureSize') -DebugFlag('TempFlag') DebugFlag('WLEngine') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7a52d29c98..cf0e2872f6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -33,7 +33,6 @@ #include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/BitVector.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" @@ -80,7 +79,7 @@ bool CoalesceEngine::done() { bool push_none = needsPush.none(); - DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", + DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", __func__, push_none ? 
"true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); @@ -428,26 +427,23 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (cacheBlocks[block_index].valid))); // We have read the address to send the wl and it is not in the // cache. Simply send the items to the PushEngine. + + DPRINTF(CoalesceEngine, "%s: Received read response for pull read " + "for addr %lu.\n", __func__, addr); int it = getBitIndexBase(addr); - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was not found in the cache.\n", - __func__, addr); + uint64_t send_mask = pendingVertexPullReads[addr]; WorkListItem* items = pkt->getPtr(); // No applying of the line needed. - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { Addr vertex_addr = addr + i * sizeof(WorkListItem); - if (needsPush[it + i] == 1) { - _workCount--; + uint64_t vertex_send_mask = send_mask & (1 << i); + if (vertex_send_mask != 0) { + assert(needsPush[it + i] == 1); needsPush[it + i] = 0; + _workCount--; owner->recvVertexPush(vertex_addr, items[i]); - break; } } - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - pendingVertexPullReads.erase(addr); delete pkt; return true; @@ -720,6 +716,7 @@ CoalesceEngine::processNextApplyEvent() if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; + activeBits.push_back(bit_index_base + index); if (!owner->running()) { owner->start(); } @@ -888,19 +885,78 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -std::tuple +// std::tuple +// CoalesceEngine::getOptimalPullAddr() +// { +// int it = startSearchIndex; +// int initial_search_index = startSearchIndex; +// while (true) { +// uint32_t current_popcount = 0; +// for (int i = 0; i < numElementsPerLine; i++) { +// current_popcount += needsPush[it + i]; +// } +// if 
(current_popcount != 0) { +// Addr addr = getBlockAddrFromBitIndex(it); +// int block_index = getBlockIndex(addr); +// // Only if it is in cache and it is in idle state. +// if ((cacheBlocks[block_index].addr == addr) && +// (cacheBlocks[block_index].valid) && +// (cacheBlocks[block_index].busyMask == 0) && +// (!cacheBlocks[block_index].pendingApply) && +// (!cacheBlocks[block_index].pendingWB)) { +// assert(!cacheBlocks[block_index].needsApply); +// assert(!cacheBlocks[block_index].pendingData); +// startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// return std::make_tuple(true, it, addr); +// // Otherwise if it is in memory +// } else if (cacheBlocks[block_index].addr != addr) { +// if (pendingVertexPullReads.find(addr) != +// pendingVertexPullReads.end()) { +// startSearchIndex = +// (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// return std::make_tuple(true, it, addr); +// } +// } +// } +// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// if (it == initial_search_index) { +// break; +// } +// } +// // return garbage +// return std::make_tuple(false, -1, 0); +// } + +std::tuple CoalesceEngine::getOptimalPullAddr() { - int it = startSearchIndex; - int initial_search_index = startSearchIndex; - while (true) { - uint32_t current_popcount = 0; - for (int i = 0; i < numElementsPerLine; i++) { - current_popcount += needsPush[it + i]; - } - if (current_popcount != 0) { - Addr addr = getBlockAddrFromBitIndex(it); - int block_index = getBlockIndex(addr); + int visited_bits = 0; + int num_intial_active_bits = activeBits.size(); + while (visited_bits < num_intial_active_bits) { + int index = activeBits.front(); + int base_index = roundDown(index, numElementsPerLine); + int index_offset = index - base_index; + assert(needsPush[index] == 1); + assert(index_offset < numElementsPerLine); + + Addr addr = getBlockAddrFromBitIndex(base_index); + int block_index = getBlockIndex(addr); + if (pendingVertexPullReads.find(addr) != 
pendingVertexPullReads.end()) + { + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + activeBits.pop_front(); + return std::make_tuple( + BitStatus::PENDING_READ, addr, index_offset); + /* + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask = 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + */ + } else { // Only if it is in cache and it is in idle state. if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid) && @@ -909,67 +965,122 @@ CoalesceEngine::getOptimalPullAddr() (!cacheBlocks[block_index].pendingWB)) { assert(!cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); - startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - return std::make_tuple(true, it, addr); + activeBits.pop_front(); + return std::make_tuple( + BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory } else if (cacheBlocks[block_index].addr != addr) { - if (pendingVertexPullReads.find(addr) != - pendingVertexPullReads.end()) { - startSearchIndex = - (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - return std::make_tuple(true, it, addr); - } + activeBits.pop_front(); + return std::make_tuple( + BitStatus::IN_MEMORY, addr, index_offset); } } - it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - if (it == initial_search_index) { - break; - } + activeBits.pop_front(); + activeBits.push_back(index); + visited_bits++; } - // return garbage - return std::make_tuple(false, -1, 0); + + return std::make_tuple(BitStatus::GARBAGE, 0, 0); } +// void +// CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) +// { +// bool hit_in_cache; +// int slice_base; +// Addr addr; + +// std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); +// if (slice_base != 
-1) { +// int block_index = getBlockIndex(addr); +// if (hit_in_cache) { +// assert(cacheBlocks[block_index].valid); +// assert(cacheBlocks[block_index].busyMask == 0); + +// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// for (int i = 0; i < numElementsPerLine; i++) { +// Addr vertex_addr = addr + i * sizeof(WorkListItem); +// if (needsPush[slice_base + i] == 1) { +// _workCount--; +// needsPush[slice_base + i] = 0; +// owner->recvVertexPush(vertex_addr, +// cacheBlocks[block_index].items[i]); +// break; +// } +// } +// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// } else { +// PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); +// SenderState* sender_state = new SenderState(true); +// pkt->pushSenderState(sender_state); +// memPort.sendPacket(pkt); +// onTheFlyReqs++; +// pendingVertexPullReads.insert(addr); +// } +// numPullsReceived--; +// } + +// if (numPullsReceived > 0) { +// memoryFunctionQueue.emplace_back( +// [this] (int slice_base, Tick schedule_tick) { +// processNextVertexPull(slice_base, schedule_tick); +// }, 0, curTick()); +// DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " +// "0 to memoryFunctionQueue.\n", __func__); +// } +// } + void -CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - bool hit_in_cache; - int slice_base; - Addr addr; - - std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); - if (slice_base != -1) { - int block_index = getBlockIndex(addr); - if (hit_in_cache) { - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - if (needsPush[slice_base + i] == 1) { - _workCount--; - 
needsPush[slice_base + i] = 0; - owner->recvVertexPush(vertex_addr, - cacheBlocks[block_index].items[i]); - break; - } - } - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - } else { + BitStatus bit_status; + Addr location; + int offset; + + std::tie(bit_status, location, offset) = getOptimalPullAddr(); + + if (bit_status != BitStatus::GARBAGE) { + if (bit_status == BitStatus::PENDING_READ) { + // renaming the outputs to thier local names. + Addr addr = location; + int index_offset = offset; + + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + } + if (bit_status == BitStatus::IN_CACHE) { + // renaming the outputs to their local names. + int block_index = (int) location; + int wl_offset = offset; + + Addr addr = cacheBlocks[block_index].addr; + Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); + int slice_base_index = getBitIndexBase(addr); + + needsPush[slice_base_index + wl_offset] = 0; + _workCount--; + owner->recvVertexPush( + vertex_addr, cacheBlocks[block_index].items[wl_offset]); + } + if (bit_status == BitStatus::IN_MEMORY) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); onTheFlyReqs++; - - pendingVertexPullReads.insert(addr); + pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; } - if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 92c28ae11e..fe7c83afb2 100644 --- 
a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -42,6 +42,14 @@ namespace gem5 { +enum BitStatus +{ + PENDING_READ, + IN_CACHE, + IN_MEMORY, + GARBAGE +}; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -107,22 +115,26 @@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; + // CLEAN: Replace with slice_base_queue int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; + std::deque activeBits; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); + std::tuple getOptimalPullAddr(); - std::unordered_set pendingVertexPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextVertexPull(int slice_base, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); std::deque, int, Tick>> memoryFunctionQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0134133cfa..505d41b0b8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -31,7 +31,6 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" -#include "debug/TempFlag.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -129,8 +128,6 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, addr, (uint32_t) wl.prop); numPendingPulls--; - DPRINTF(TempFlag, "%s: Received {addr: %lu, wl: %s}.\n", - __func__, addr, wl.to_string()); if (workLeft() && 
vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } From 8d4f9b0e2bb82986db1d367e03cc6be48140d55c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 27 Jul 2022 18:36:52 -0700 Subject: [PATCH 156/247] Adding support for synthetic traffic --- configs/accl/sega.py | 125 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 3fa5b99b3a..8e901b6e6d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,8 +1,35 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import m5 +import os import argparse +import subprocess from math import log -import math from m5.objects import * def interleave_addresses(plain_range, num_channels, cache_line_size): @@ -103,21 +130,101 @@ def __init__(self, def get_inputs(): argparser = argparse.ArgumentParser() - argparser.add_argument("num_mpus", type=int) + argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph_path", type=str) + argparser.add_argument("vertex_cache_line_size", type=int) + argparser.add_argument("synthetic", type=bool) + argparser.add_argument("--scale", type=int) + argparser.add_argument("--deg", type=int) + argparser.add_argument("--graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + args = argparser.parse_args() - return args.num_mpus, args.cache_size, \ - args.graph_path, args.init_addr, args.init_value + if args.synthetic: + if (args.scale is None) or (args.deg is None): + raise ValueError("If synthetic is true, you should specify the" + "scale of the graph by --scale [scale] and the average" + "degree of the graph by --deg [average degree].") + else: + if args.graph is None: + raise ValueError("If synthetic is false, you should specify the " + "path to graph binaries by --graph [path to graph].") + return args if __name__ == "__m5_main__": - num_mpus, cache_size, graph_path, first_addr, first_value = get_inputs() - 
- print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, cache_size, graph_path, first_addr, first_value) + input_args = get_inputs() + + image_path = None + if input_args.synthetic: + base_dir = os.environ.get("GRAPH_DIR", default="/tmp") + graph_gen = os.path.abspath(os.environ.get("GRAPH_GEN")) + graph_reader = os.environ.get("GRAPH_READER") + graph_sorter = os.environ.get("GRAPH_SORTER") + if graph_gen is None: + raise ValueError(f"No value for $GRAPH_GEN.") + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER") + + graph_path = os.path.join(base_dir, f"graph_{input_args.scale}_{input_args.deg}") + if not os.path.exists(graph_path): + print(f"{graph_path} does not exist already.") + os.mkdir(graph_path) + print(f"Created {graph_path}") + + if not "graph.txt" in os.listdir(graph_path): + print(f"graph.txt not found in {graph_path}") + subprocess.run([f"{graph_gen}", + f"{input_args.scale}", + f"{input_args.deg}", + f"{graph_path}/graph_unordered.txt"]) + print(f"Generated a graph with scale " + f"{input_args.scale} and deg {input_args.deg}") + subprocess.run(["python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt"]) + print(f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt") + subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) + print(f"Deleted {graph_path}/graph_unordered.txt") + + if not "binaries" in os.listdir(graph_path): + print(f"binaries directory not found in {graph_path}") + os.mkdir(f"{graph_path}/binaries") + print(f"Created {graph_path}/binaries") + + if not f"gpts_{input_args.num_gpts}" in os.listdir(f"{graph_path}/binaries"): + print(f"gpts_{input_args.num_gpts} not found in {graph_path}/binaries") + os.mkdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") + print(f"Created 
{graph_path}/binaries/gpts_{input_args.num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(input_args.num_gpts)] + if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{input_args.num_gpts}") + for delete in os.scandir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}"): + os.remove(delete.path) + print(f"Deleted all the files in {graph_path}/binaries/gpts_{input_args.num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}/graph.txt", + "false", + f"{input_args.num_gpts}", + f"{input_args.vertex_cache_line_size}", + f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_path}/binaries/n{input_args.num_gpts}") + image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" + else: + image_path = input_args.graph + + system = SEGA(input_args.num_gpts, + input_args.cache_size, + image_path, + input_args.init_addr, + input_args.init_value) root = Root(full_system = False, system = system) m5.instantiate() From 7ddb4cf48879fca09694b983c46ae486bbf97bc2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 27 Jul 2022 23:42:01 -0700 Subject: [PATCH 157/247] Adding workload as a parameter --- configs/accl/sega.py | 2 +- src/accl/graph/sega/PushEngine.py | 2 ++ src/accl/graph/sega/WLEngine.py | 2 ++ src/accl/graph/sega/push_engine.cc | 17 ++++++++++++++++- src/accl/graph/sega/push_engine.hh | 3 ++- src/accl/graph/sega/wl_engine.cc | 19 +++++++++++++++++-- src/accl/graph/sega/wl_engine.hh | 5 +++++ 7 files changed, 45 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8e901b6e6d..ddeae34e4e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -215,7 +215,7 @@ def get_inputs(): f"{input_args.vertex_cache_line_size}", f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) print(f"Created the 
graph binaries in " - f"{graph_path}/binaries/n{input_args.num_gpts}") + f"{graph_path}/binaries/gpts_{input_args.num_gpts}") image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" else: image_path = input_args.graph diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index f98f22ba9d..ad9ddfefcf 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -41,3 +41,5 @@ class PushEngine(BaseMemoryEngine): resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " "edges read from memory") + + workload = Param.String("BFS", "Name of the workload") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 52ca031260..a44352ab9b 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -40,3 +40,5 @@ class WLEngine(BaseReduceEngine): "WLEngine has. It can service as " "many updates as this queueu has " "entries at the same time.") # 4 is arbitrary + + workload = Param.String('BFS',"Name of the workload") \ No newline at end of file diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 505d41b0b8..9f13c00397 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,6 +42,7 @@ PushEngine::PushEngine(const Params& params): _running(false), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + workload(params.workload), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), @@ -85,6 +86,20 @@ PushEngine::done() edgePointerQueue.empty(); } + +uint32_t +PushEngine::propagate(uint32_t value, uint32_t weight) +{ + uint32_t update; + if (workload == "BFS") { + update = value + 1; + } + else{ + panic("The workload %s is 
not supported", workload); + } + return update; +} + void PushEngine::start() { @@ -239,7 +254,7 @@ PushEngine::processNextPushEvent() __func__, curr_edge.to_string()); // TODO: Implement propagate function here - uint32_t update_value = curr_edge.value + 1; + uint32_t update_value = propagate(value, 1); PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 6f92b62be0..a64a5b1f5b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -82,7 +82,6 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } }; - struct PushInfo { Addr src; uint32_t value; @@ -103,6 +102,8 @@ class PushEngine : public BaseMemoryEngine int edgeQueueSize; std::deque> edgeQueue; + std::string workload; + uint32_t propagate(uint32_t value, uint32_t weight); template PacketPtr createUpdatePacket(Addr addr, T value); bool vertexSpace(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9890eeed76..855e36b413 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -41,6 +41,7 @@ WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), + workload(params.workload), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) @@ -58,6 +59,18 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } +uint32_t +WLEngine::reduce(uint32_t update, uint32_t value) +{ + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} + bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { @@ -138,7 +151,8 @@ WLEngine::processNextReadEvent() "addr: %lu in 
registerFile. registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = - std::min(update_value, registerFile[update_addr]); + reduce(update_value, registerFile[update_addr]); + // std::min(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -191,7 +205,8 @@ WLEngine::processNextReduceEvent() addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = - std::min(update_value, workListFile[addr].tempProp); + reduce(update_value, workListFile[addr].tempProp); + // std::min(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 4a0489b123..b03a3cdb87 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -47,6 +47,8 @@ class WLEngine : public BaseReduceEngine private: MPU* owner; + + int updateQueueSize; std::deque> updateQueue; @@ -55,6 +57,9 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; + std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); + EventFunctionWrapper nextReadEvent; void processNextReadEvent(); From 302bc6e3e6be79a515890427c50b765a463441b1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 7 Sep 2022 13:22:40 -0700 Subject: [PATCH 158/247] Adding workload as a parameter to coalesce engine. 
--- src/accl/graph/sega/CoalesceEngine.py | 5 ++ src/accl/graph/sega/coalesce_engine.cc | 120 ++++--------------------- src/accl/graph/sega/coalesce_engine.hh | 5 +- src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 2 - 5 files changed, 28 insertions(+), 106 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 2cc756ff3f..f6e997f1e3 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -35,7 +35,12 @@ class CoalesceEngine(BaseMemoryEngine): cxx_class = 'gem5::CoalesceEngine' cache_size = Param.MemorySize("Size of the internal SRAM array.") + num_mshr_entry = Param.Int("Number of MSHR entries.") + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") + + workload = Param.String("BFS", "Name of the workload") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index cf0e2872f6..a80d629737 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,7 +49,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), startSearchIndex(0), + _workCount(0), numPullsReceived(0), workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -85,6 +85,18 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } +uint32_t +CoalesceEngine::reduce(uint32_t update, uint32_t value) +{ + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} + // addr should be aligned to peerMemoryAtomSize int 
CoalesceEngine::getBlockIndex(Addr addr) @@ -700,8 +712,12 @@ CoalesceEngine::processNextApplyEvent() assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - uint32_t new_prop = std::min(current_prop, - cacheBlocks[block_index].items[index].tempProp); + // NOTE: It might be the case that for workloads other than BFS, + // the reduce function here should be different to the reduce + // function defined in WLEngine. Think about the case of PR in + // detail. + uint32_t new_prop = reduce( + cacheBlocks[block_index].items[index].tempProp, current_prop); if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; @@ -885,48 +901,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -// std::tuple -// CoalesceEngine::getOptimalPullAddr() -// { -// int it = startSearchIndex; -// int initial_search_index = startSearchIndex; -// while (true) { -// uint32_t current_popcount = 0; -// for (int i = 0; i < numElementsPerLine; i++) { -// current_popcount += needsPush[it + i]; -// } -// if (current_popcount != 0) { -// Addr addr = getBlockAddrFromBitIndex(it); -// int block_index = getBlockIndex(addr); -// // Only if it is in cache and it is in idle state. 
-// if ((cacheBlocks[block_index].addr == addr) && -// (cacheBlocks[block_index].valid) && -// (cacheBlocks[block_index].busyMask == 0) && -// (!cacheBlocks[block_index].pendingApply) && -// (!cacheBlocks[block_index].pendingWB)) { -// assert(!cacheBlocks[block_index].needsApply); -// assert(!cacheBlocks[block_index].pendingData); -// startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// return std::make_tuple(true, it, addr); -// // Otherwise if it is in memory -// } else if (cacheBlocks[block_index].addr != addr) { -// if (pendingVertexPullReads.find(addr) != -// pendingVertexPullReads.end()) { -// startSearchIndex = -// (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// return std::make_tuple(true, it, addr); -// } -// } -// } -// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// if (it == initial_search_index) { -// break; -// } -// } -// // return garbage -// return std::make_tuple(false, -1, 0); -// } - std::tuple CoalesceEngine::getOptimalPullAddr() { @@ -949,13 +923,6 @@ CoalesceEngine::getOptimalPullAddr() activeBits.pop_front(); return std::make_tuple( BitStatus::PENDING_READ, addr, index_offset); - /* - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask = 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - */ } else { // Only if it is in cache and it is in idle state. 
if ((cacheBlocks[block_index].addr == addr) && @@ -983,55 +950,6 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple(BitStatus::GARBAGE, 0, 0); } -// void -// CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) -// { -// bool hit_in_cache; -// int slice_base; -// Addr addr; - -// std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); -// if (slice_base != -1) { -// int block_index = getBlockIndex(addr); -// if (hit_in_cache) { -// assert(cacheBlocks[block_index].valid); -// assert(cacheBlocks[block_index].busyMask == 0); - -// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// for (int i = 0; i < numElementsPerLine; i++) { -// Addr vertex_addr = addr + i * sizeof(WorkListItem); -// if (needsPush[slice_base + i] == 1) { -// _workCount--; -// needsPush[slice_base + i] = 0; -// owner->recvVertexPush(vertex_addr, -// cacheBlocks[block_index].items[i]); -// break; -// } -// } -// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// } else { -// PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); -// SenderState* sender_state = new SenderState(true); -// pkt->pushSenderState(sender_state); -// memPort.sendPacket(pkt); -// onTheFlyReqs++; -// pendingVertexPullReads.insert(addr); -// } -// numPullsReceived--; -// } - -// if (numPullsReceived > 0) { -// memoryFunctionQueue.emplace_back( -// [this] (int slice_base, Tick schedule_tick) { -// processNextVertexPull(slice_base, schedule_tick); -// }, 0, curTick()); -// DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " -// "0 to memoryFunctionQueue.\n", __func__); -// } -// } - void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index fe7c83afb2..7503d69b76 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -115,8 +115,6 
@@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; - // CLEAN: Replace with slice_base_queue - int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; @@ -130,6 +128,9 @@ class CoalesceEngine : public BaseMemoryEngine // send for push when getting the read response from memory. std::unordered_map pendingVertexPullReads; + std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9f13c00397..625f836561 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -254,7 +254,7 @@ PushEngine::processNextPushEvent() __func__, curr_edge.to_string()); // TODO: Implement propagate function here - uint32_t update_value = propagate(value, 1); + uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 855e36b413..5465769cff 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -152,7 +152,6 @@ WLEngine::processNextReadEvent() __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = reduce(update_value, registerFile[update_addr]); - // std::min(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. 
registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -206,7 +205,6 @@ WLEngine::processNextReduceEvent() // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = reduce(update_value, workListFile[addr].tempProp); - // std::min(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; From ab2362a81cfec8311e017d824c9d6208beec235d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 8 Sep 2022 10:20:48 -0700 Subject: [PATCH 159/247] Adding stats. --- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 21 ++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 4 +++- src/accl/graph/sega/push_engine.cc | 7 ++++++- src/accl/graph/sega/push_engine.hh | 2 ++ src/accl/graph/sega/wl_engine.cc | 9 ++++++++- src/accl/graph/sega/wl_engine.hh | 1 + 7 files changed, 35 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ddeae34e4e..e8d76e7dad 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -159,7 +159,7 @@ def get_inputs(): image_path = None if input_args.synthetic: base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.path.abspath(os.environ.get("GRAPH_GEN")) + graph_gen = os.environ.get("GRAPH_GEN") graph_reader = os.environ.get("GRAPH_READER") graph_sorter = os.environ.get("GRAPH_SORTER") if graph_gen is None: diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a80d629737..dbe5e56f2d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -210,7 +210,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. 
Rejecting request.\n", __func__, block_index); - stats.readRejections++; + stats.mshrTargetShortage++; return false; } else { DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " @@ -241,7 +241,7 @@ CoalesceEngine::recvWLRead(Addr addr) "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection - stats.readRejections++; + stats.mshrEntryShortage++; return false; } else { DPRINTF(CoalesceEngine, "%s: MSHR " @@ -399,7 +399,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. Rejecting request.\n", __func__, block_index); - stats.readRejections++; + stats.mshrTargetShortage++; return false; } DPRINTF(CoalesceEngine, "%s: There is room for another target " @@ -740,6 +740,8 @@ CoalesceEngine::processNextApplyEvent() } } } + stats.bitvectorLength.sample(needsPush.count()); + cacheBlocks[block_index].needsWB = true; cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; @@ -1055,12 +1057,16 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), - ADD_STAT(readRejections, statistics::units::Count::get(), - "Number of cache rejections."), + ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), + "Number of cache rejections caused by entry shortage."), + ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), + "Number of cache rejections caused by target shortage."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries.") + "Histogram on the length of the mshr entries."), + ADD_STAT(bitvectorLength, statistics::units::Count::get(), + "Histogram of the length of the bitvector") { } @@ -1069,7 +1075,8 @@ 
CoalesceEngine::CoalesceStats::regStats() { using namespace statistics; - mshrEntryLength.init(64); + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); hitRate = (readHits + readHitUnderMisses) / (readHits + readHitUnderMisses + readMisses); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 7503d69b76..16c417fc60 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -158,10 +158,12 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; - statistics::Scalar readRejections; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; statistics::Formula hitRate; statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; }; CoalesceStats stats; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 625f836561..855d666989 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -300,7 +300,10 @@ PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), ADD_STAT(numUpdates, statistics::units::Count::get(), - "Number of sent updates.") + "Number of sent updates."), + ADD_STAT(TEPS, statistics::units::Rate::get(), + "Traversed Edges Per Second.") { } @@ -308,6 +311,8 @@ void PushEngine::PushStats::regStats() { using namespace statistics; + + TEPS = numUpdates / simSeconds; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a64a5b1f5b..a5677067b8 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -127,6 +127,8 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; statistics::Scalar numUpdates; + + statistics::Formula TEPS; }; PushStats stats; diff --git a/src/accl/graph/sega/wl_engine.cc 
b/src/accl/graph/sega/wl_engine.cc index 5465769cff..a39905037e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -144,6 +144,10 @@ WLEngine::processNextReadEvent() update_value, updateQueue.size(), updateQueueSize); owner->checkRetryReq(); } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + stats.registerShortage++; } } else { // TODO: Generalize this to reduce function rather than just min @@ -231,7 +235,10 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), - "Number of memory blocks read for vertecies") + "Number of memory blocks read for vertecies"), + ADD_STAT(registerShortage, statistics::units::Count::get(), + "Number of times updates were " + "stalled because of register shortage") { } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index b03a3cdb87..2956e58666 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -76,6 +76,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; + statistics::Scalar registerShortage; }; WorkListStats stats; From 40b01f05558c798a20e60b26822d9ca8241b47eb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 11 Sep 2022 14:39:42 -0700 Subject: [PATCH 160/247] Separating graph generation from run script. 
--- configs/accl/graph-gen.py | 103 ++++++++++++++++++++++++++++++++++++++ configs/accl/sega.py | 96 +++-------------------------------- 2 files changed, 110 insertions(+), 89 deletions(-) create mode 100644 configs/accl/graph-gen.py diff --git a/configs/accl/graph-gen.py b/configs/accl/graph-gen.py new file mode 100644 index 0000000000..16985b3537 --- /dev/null +++ b/configs/accl/graph-gen.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import argparse +import subprocess + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("scale", type=int, help="The scale of the synth graph to generate.") + argparser.add_argument("deg", type=int, help="The average degree of the synth graph to generate.") + argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + + args = argparser.parse_args() + return args.scale, args.deg, args.num_gpts + +if __name__ == "__main__": + scale, deg, num_gpts = get_inputs() + + base_dir = os.environ.get("GRAPH_DIR", default="/tmp") + graph_gen = os.environ.get("GRAPH_GEN") + graph_reader = os.environ.get("GRAPH_READER") + graph_sorter = os.environ.get("GRAPH_SORTER") + if graph_gen is None: + raise ValueError(f"No value for $GRAPH_GEN.") + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER") + + graph_path = os.path.join(base_dir, f"graph_{scale}_{deg}") + if not os.path.exists(graph_path): + print(f"{graph_path} does not exist already.") + os.mkdir(graph_path) + print(f"Created {graph_path}") + + if not "graph.txt" in os.listdir(graph_path): + print(f"graph.txt not found in {graph_path}") + for delete in os.scandir(graph_path): + os.remove(delete.path) + print(f"Deleted everything in {graph_path}") + subprocess.run([f"{graph_gen}", + f"{scale}", + f"{deg}", + f"{graph_path}/graph_unordered.txt"]) + print(f"Generated a graph with scale " + f"{scale} and deg {deg}") + subprocess.run(["python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt"]) + print(f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt") + subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) + print(f"Deleted {graph_path}/graph_unordered.txt") + + if not "binaries" in os.listdir(graph_path): + print(f"binaries directory not found in 
{graph_path}") + os.mkdir(f"{graph_path}/binaries") + print(f"Created {graph_path}/binaries") + + if not f"gpts_{num_gpts}" in os.listdir(f"{graph_path}/binaries"): + print(f"gpts_{num_gpts} not found in {graph_path}/binaries") + os.mkdir(f"{graph_path}/binaries/gpts_{num_gpts}") + print(f"Created {graph_path}/binaries/gpts_{num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] + if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): + os.remove(delete.path) + print(f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}/graph.txt", + "false", + f"{num_gpts}", + "32", + f"{graph_path}/binaries/gpts_{num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_path}/binaries/gpts_{num_gpts}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e8d76e7dad..10f7ea2b48 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -25,9 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import m5 -import os import argparse -import subprocess from math import log from m5.objects import * @@ -49,7 +47,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=32, + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, @@ -132,99 +130,19 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("vertex_cache_line_size", type=int) - argparser.add_argument("synthetic", type=bool) - argparser.add_argument("--scale", type=int) - argparser.add_argument("--deg", type=int) - argparser.add_argument("--graph", type=str) + argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - if args.synthetic: - if (args.scale is None) or (args.deg is None): - raise ValueError("If synthetic is true, you should specify the" - "scale of the graph by --scale [scale] and the average" - "degree of the graph by --deg [average degree].") - else: - if args.graph is None: - raise ValueError("If synthetic is false, you should specify the " - "path to graph binaries by --graph [path to graph].") - return args + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value if __name__ == "__m5_main__": - input_args = get_inputs() - - image_path = None - if input_args.synthetic: - base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.environ.get("GRAPH_GEN") - graph_reader = os.environ.get("GRAPH_READER") - graph_sorter = os.environ.get("GRAPH_SORTER") - if graph_gen is None: - raise ValueError(f"No value for $GRAPH_GEN.") - if graph_reader is None: - 
raise ValueError(f"No value for $GRAPH_READER.") - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER") - - graph_path = os.path.join(base_dir, f"graph_{input_args.scale}_{input_args.deg}") - if not os.path.exists(graph_path): - print(f"{graph_path} does not exist already.") - os.mkdir(graph_path) - print(f"Created {graph_path}") - - if not "graph.txt" in os.listdir(graph_path): - print(f"graph.txt not found in {graph_path}") - subprocess.run([f"{graph_gen}", - f"{input_args.scale}", - f"{input_args.deg}", - f"{graph_path}/graph_unordered.txt"]) - print(f"Generated a graph with scale " - f"{input_args.scale} and deg {input_args.deg}") - subprocess.run(["python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt"]) - print(f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt") - subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) - print(f"Deleted {graph_path}/graph_unordered.txt") - - if not "binaries" in os.listdir(graph_path): - print(f"binaries directory not found in {graph_path}") - os.mkdir(f"{graph_path}/binaries") - print(f"Created {graph_path}/binaries") - - if not f"gpts_{input_args.num_gpts}" in os.listdir(f"{graph_path}/binaries"): - print(f"gpts_{input_args.num_gpts} not found in {graph_path}/binaries") - os.mkdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") - print(f"Created {graph_path}/binaries/gpts_{input_args.num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(input_args.num_gpts)] - if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{input_args.num_gpts}") - for delete in os.scandir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}"): - os.remove(delete.path) - print(f"Deleted all the files in {graph_path}/binaries/gpts_{input_args.num_gpts}") - 
subprocess.run([f"{graph_reader}" , - f"{graph_path}/graph.txt", - "false", - f"{input_args.num_gpts}", - f"{input_args.vertex_cache_line_size}", - f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{input_args.num_gpts}") - image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" - else: - image_path = input_args.graph - - system = SEGA(input_args.num_gpts, - input_args.cache_size, - image_path, - input_args.init_addr, - input_args.init_value) + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) root = Root(full_system = False, system = system) m5.instantiate() From 6124b008976c8797d0b330815f9b04579abf42ce Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 12 Sep 2022 15:25:11 -0700 Subject: [PATCH 161/247] Adding new stats. --- src/accl/graph/sega/coalesce_engine.cc | 13 ++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 2 ++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dbe5e56f2d..7646ba8862 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -834,9 +834,13 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } } void @@ -1000,6 +1004,8 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; + } else { + stats.workSearchFails++; } if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( @@ -1061,6 +1067,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), + ADD_STAT(workSearchFails, statistics::units::Count::get(), + "Number of times coalesce engine fails to find work to push."), + ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), + "Number of times a memory block has been read twice. " + "Once for push and once to populate the cache."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 16c417fc60..355eaad07d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -160,6 +160,8 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; + statistics::Scalar workSearchFails; + statistics::Scalar numDoubleMemReads; statistics::Formula hitRate; statistics::Histogram mshrEntryLength; From 655902315cc2a07658100ebbdc568cb59523ef85 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 13 Sep 2022 21:44:54 -0700 Subject: [PATCH 162/247] Fixing sconscript style. 
--- src/accl/graph/base/SConscript | 6 ++--- src/accl/graph/sega/SConscript | 44 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 0e43d1aed8..8b741abfc8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -25,8 +25,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -Import('*') +Import("*") -SimObject('BaseReduceEngine.py') +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) -Source('base_reduce_engine.cc') +Source("base_reduce_engine.cc") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 5d48b46fba..f16d025ca2 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -25,30 +25,30 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-Import('*') +Import("*") -SimObject('BaseMemoryEngine.py') -SimObject('CenteralController.py') -SimObject('CoalesceEngine.py') -SimObject("MPU.py") -SimObject('PushEngine.py') -SimObject('WLEngine.py') +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) -Source('base_memory_engine.cc') -Source('centeral_controller.cc') -Source('coalesce_engine.cc') +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") Source("mpu.cc") -Source('push_engine.cc') -Source('wl_engine.cc') +Source("push_engine.cc") +Source("wl_engine.cc") -DebugFlag('ApplyUpdates') -DebugFlag('BaseMemoryEngine') -DebugFlag('CenteralController') -DebugFlag('CacheBlockState') -DebugFlag('CoalesceEngine') -DebugFlag('PushEngine') -DebugFlag('SEGAStructureSize') -DebugFlag('WLEngine') +DebugFlag("ApplyUpdates") +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("WLEngine") -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', - 'WLEngine', 'BaseMemoryEngine']) \ No newline at end of file +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine"]) \ No newline at end of file From 489e914deb132f3b81cd0b31ff0254226aa08db9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 15 Sep 2022 11:16:25 -0700 Subject: [PATCH 163/247] Adding stats for measuring push and pull rate. 
--- configs/accl/sega.py | 21 ++++++++----- src/accl/graph/sega/coalesce_engine.cc | 34 ++++++++++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 41 ++++++++++++++++---------- 3 files changed, 72 insertions(+), 24 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 10f7ea2b48..2a92ee1769 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,8 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False)) + range=AddrRange(edge_memory_size), + in_addr_map=False)) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -92,7 +92,8 @@ def __init__(self, cache_size, graph_path, first_addr, - first_value): + first_value + ): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -103,16 +104,20 @@ def __init__(self, self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, response_latency=1, - width=64) + width=64 + ) self.ctrl = CenteralController(addr=first_addr, value=first_value, - image_file=f"{graph_path}/vertices") + image_file=f"{graph_path}/vertices" + ) + self.ctrl.req_port = self.interconnect.cpu_side_ports vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7646ba8862..5f1e849660 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -454,6 +454,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) needsPush[it + i] = 0; _workCount--; owner->recvVertexPush(vertex_addr, items[i]); + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - 
stats.lastResetTick; } } pendingVertexPullReads.erase(addr); @@ -990,6 +992,8 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) _workCount--; owner->recvVertexPush( vertex_addr, cacheBlocks[block_index].items[wl_offset]); + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } if (bit_status == BitStatus::IN_MEMORY) { Addr addr = location; @@ -1037,6 +1041,8 @@ CoalesceEngine::recvVertexPull() bool should_schedule = (numPullsReceived == 0); numPullsReceived++; + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; if (should_schedule) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { @@ -1052,7 +1058,7 @@ CoalesceEngine::recvVertexPull() CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), - + lastResetTick(0), ADD_STAT(numVertexReads, statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), @@ -1072,8 +1078,22 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), "Number of times a memory block has been read twice. " "Once for push and once to populate the cache."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. 
(Relative to reset_stats)"), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), @@ -1091,6 +1111,18 @@ CoalesceEngine::CoalesceStats::regStats() hitRate = (readHits + readHitUnderMisses) / (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 355eaad07d..8190478a1b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -147,25 +147,36 @@ class CoalesceEngine : public BaseMemoryEngine struct CoalesceStats : public statistics::Group { - CoalesceStats(CoalesceEngine &coalesce); + CoalesceStats(CoalesceEngine &coalesce); - void regStats() override; + virtual void regStats() override; - CoalesceEngine &coalesce; + virtual void resetStats() override; - statistics::Scalar numVertexReads; - statistics::Scalar numVertexWrites; - statistics::Scalar readHits; - statistics::Scalar readMisses; - statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; - statistics::Scalar workSearchFails; - statistics::Scalar numDoubleMemReads; + CoalesceEngine &coalesce; - statistics::Formula hitRate; - statistics::Histogram mshrEntryLength; - statistics::Histogram bitvectorLength; + Tick lastResetTick; + + statistics::Scalar 
numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; + statistics::Scalar workSearchFails; + statistics::Scalar numDoubleMemReads; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; }; CoalesceStats stats; From b297c794e5c08daa6be9727b554687507594a034 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 16 Sep 2022 14:18:57 -0700 Subject: [PATCH 164/247] Added FinalAnswer debugFlag and answer printing. --- configs/accl/sega.py | 8 ++-- src/accl/graph/sega/CenteralController.py | 4 +- src/accl/graph/sega/SConscript | 2 +- src/accl/graph/sega/base_memory_engine.hh | 2 +- src/accl/graph/sega/centeral_controller.cc | 43 ++++++++++++++++++---- src/accl/graph/sega/centeral_controller.hh | 7 ++-- src/accl/graph/sega/coalesce_engine.cc | 36 ++++++++++++++---- src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/push_engine.hh | 2 + 9 files changed, 82 insertions(+), 24 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2a92ee1769..7b37742cdb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -107,9 +107,11 @@ def __init__(self, width=64 ) - self.ctrl = CenteralController(addr=first_addr, value=first_value, - image_file=f"{graph_path}/vertices" - ) + self.ctrl = CenteralController( + init_addr=first_addr, + init_value=first_value, + image_file=f"{graph_path}/vertices" + ) self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 6f6b12ea2c..9bee76511d 
100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -39,6 +39,6 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") - addr = Param.Addr("The addr for the initial update") - value = Param.Int("The value for the initial update") + init_addr = Param.Addr("The addr for the initial update") + init_value = Param.Int("The value for the initial update") image_file = Param.String("Path to the global memory image.") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index f16d025ca2..5d411be9ac 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -41,11 +41,11 @@ Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") -DebugFlag("ApplyUpdates") DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") DebugFlag("CacheBlockState") DebugFlag("CoalesceEngine") +DebugFlag("FinalAnswer") DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") DebugFlag("WLEngine") diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index f336edcbf1..afe7fd0433 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -108,7 +108,7 @@ class BaseMemoryEngine : public ClockedObject AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } - void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + virtual void recvFunctional(PacketPtr pkt) = 0; virtual void init() override; }; diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 5ce7228abb..c6de1d8390 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,8 +28,6 @@ #include "accl/graph/sega/centeral_controller.hh" -#include - #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" @@ -44,8 +42,7 @@ 
CenteralController::CenteralController ClockedObject(params), system(params.system), reqPort(name() + ".req_port", this), - addr(params.addr), - value(params.value) + maxVertexAddr(0) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -66,9 +63,9 @@ CenteralController::getPort(const std::string &if_name, PortID idx) void CenteralController::initState() { - ClockedObject::initState(); + // ClockedObject::initState(); - const auto &file = params().image_file; + const auto& file = params().image_file; if (file == "") return; @@ -77,6 +74,7 @@ CenteralController::initState() loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); + maxVertexAddr = image.maxAddr(); PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, system->cacheLineSize()); @@ -86,7 +84,10 @@ CenteralController::initState() void CenteralController::startup() { - PacketPtr first_update = createUpdatePacket(addr, value); + Addr initial_addr = params().init_addr; + uint32_t initial_value = params().init_value; + PacketPtr first_update = + createUpdatePacket(initial_addr, initial_value); if (!reqPort.blocked()) { reqPort.sendPacket(first_update); @@ -111,6 +112,21 @@ CenteralController::createUpdatePacket(Addr addr, T value) return pkt; } +PacketPtr +CenteralController::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC((Addr) 0); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + void CenteralController::ReqPort::sendPacket(PacketPtr pkt) { @@ -160,6 +176,19 @@ CenteralController::recvDoneSignal() } if (done) { + for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) { + PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + reqPort.sendFunctional(pkt); + + int 
num_items = system->cacheLineSize() / sizeof(WorkListItem); + WorkListItem items[num_items]; + pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + + for (int i = 0; i < num_items; i++) { + DPRINTF(FinalAnswer, "%s: WorkListItem[%lu][%d]: %s.\n", + __func__, addr, i, items[i].to_string()); + } + } exitSimLoopNow("no update left to process."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index c54c4c04ef..bd272cf30d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/mpu.hh" +#include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -67,12 +68,12 @@ class CenteralController : public ClockedObject System* system; ReqPort reqPort; - Addr addr; - uint32_t value; - + Addr maxVertexAddr; std::vector mpuVector; + template PacketPtr createUpdatePacket(Addr addr, T value); + PacketPtr createReadPacket(Addr addr, unsigned int size); void functionalAccess(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 5f1e849660..59d9720148 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,7 +32,6 @@ #include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" -#include "debug/ApplyUpdates.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" @@ -75,12 +74,38 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + 
assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + // NOTE: No need to check needsWB because there might be entries + // that have been updated and not written back in the cache. + // assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + memPort.sendFunctional(pkt); + } +} + bool CoalesceEngine::done() { - bool push_none = needsPush.none(); - DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", - __func__, push_none ? "true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -723,9 +748,6 @@ CoalesceEngine::processNextApplyEvent() if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu][%d]: %s.\n", - __func__, cacheBlocks[block_index].addr, index, - cacheBlocks[block_index].items[index].to_string()); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8190478a1b..bb6fd9d1ea 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -190,6 +190,8 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + virtual void recvFunctional(PacketPtr pkt); + bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a5677067b8..b317992b2d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -142,6 +142,8 @@ class 
PushEngine : public BaseMemoryEngine PushEngine(const Params& params); void registerMPU(MPU* mpu); + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); From 16216bc2bf3dee723fa35eccd478412e47bfe738 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 18 Sep 2022 17:17:24 -0700 Subject: [PATCH 165/247] Adding stats to measure vertexReadLatency. --- src/accl/graph/sega/coalesce_engine.cc | 5 ++++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 14 ++++++++++++-- src/accl/graph/sega/wl_engine.hh | 5 +++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 59d9720148..d4102a8bca 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -810,6 +810,7 @@ void CoalesceEngine::processNextMemoryEvent() { if (memPort.blocked()) { + stats.numMemoryBlocks++; nextMemoryEvent.sleep(); return; } @@ -1097,6 +1098,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by target shortage."), ADD_STAT(workSearchFails, statistics::units::Count::get(), "Number of times coalesce engine fails to find work to push."), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), "Number of times a memory block has been read twice. 
" "Once for push and once to populate the cache."), @@ -1147,4 +1150,4 @@ CoalesceEngine::CoalesceStats::resetStats() lastResetTick = curTick(); } -} +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index bb6fd9d1ea..967d83a531 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -165,6 +165,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; statistics::Scalar workSearchFails; + statistics::Scalar numMemoryBlocks; statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a39905037e..b16d827dbe 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -143,6 +143,7 @@ WLEngine::processNextReadEvent() "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); owner->checkRetryReq(); + vertexReadTime[update_addr] = curTick(); } } else { DPRINTF(WLEngine, "%s: There are no free registers " @@ -189,6 +190,11 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " "workListFile. 
workListFile.size = %d.\n", __func__, addr, wl.to_string(), workListFile.size()); + + stats.vertexReadLatency.sample( + (curTick() - vertexReadTime[addr]) / getClockFrequency()); + vertexReadTime.erase(addr); + assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); @@ -238,7 +244,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "Number of memory blocks read for vertecies"), ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " - "stalled because of register shortage") + "stalled because of register shortage"), + ADD_STAT(vertexReadLatency, statistics::units::Second::get(), + "Histogram of the latency of reading a vertex.") { } @@ -246,6 +254,8 @@ void WLEngine::WorkListStats::regStats() { using namespace statistics; -} + vertexReadLatency.init(64); } + +} // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2956e58666..0c6361825e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -47,13 +47,12 @@ class WLEngine : public BaseReduceEngine private: MPU* owner; - - int updateQueueSize; std::deque> updateQueue; int registerFileSize; std::unordered_map registerFile; + std::unordered_map vertexReadTime; std::unordered_map workListFile; @@ -77,6 +76,8 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + + statistics::Histogram vertexReadLatency; }; WorkListStats stats; From 3e6216c8976155517cb9edb2874ca7c890b56255 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 19 Sep 2022 11:56:05 -0700 Subject: [PATCH 166/247] Adding a config script with simple memory --- configs/accl/sega-simple.py | 177 ++++++++++++++++++++++++++++++++++++ configs/accl/sega.py | 48 ++++++---- 2 files changed, 206 insertions(+), 19 deletions(-) create mode 100644 configs/accl/sega-simple.py diff --git 
a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py new file mode 100644 index 0000000000..ae537e76ca --- /dev/null +++ b/configs/accl/sega-simple.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="75ns", + latency_var="0ns", + bandwidth="19.2GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="75ns", + latency_var="0ns", + bandwidth="19.2GB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__( + self, + num_mpus, + cache_size, + graph_path, + first_addr, + first_value + ): + super(SEGA, self).__init__() 
+ self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '1GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.interconnect = NoncoherentXBar( + frontend_latency=1, + forward_latency=1, + response_latency=1, + width=64 + ) + + self.ctrl = CenteralController( + init_addr=first_addr, + init_value=first_value, + image_file=f"{graph_path}/vertices" + ) + + self.ctrl.req_port = self.interconnect.cpu_side_ports + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("8GiB", cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setReqPort(self.interconnect.cpu_side_ports) + gpt.setRespPort(self.interconnect.mem_side_ports) + gpts.append(gpt) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + root = Root(full_system = False, system = system) + + m5.instantiate() + + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7b37742cdb..8c30d10dec 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,29 +47,39 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): 
def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, - register_file_size=32) - self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, num_tgts_per_mshr=32, - max_resp_per_cycle=4) - self.push_engine = PushEngine(push_req_queue_size=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64) + resp_queue_size=64 + ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False)) + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port - self.mpu = MPU(wl_engine=self.wl_engine, + self.mpu = MPU( + wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine) + push_engine=self.push_engine + ) def getRespPort(self): return self.mpu.in_port @@ -87,7 +97,8 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): - def __init__(self, + def __init__( + self, num_mpus, cache_size, graph_path, @@ -101,25 +112,24 @@ def __init__(self, self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar(frontend_latency=1, + self.interconnect = NoncoherentXBar( + frontend_latency=1, forward_latency=1, response_latency=1, width=64 ) self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, + addr=first_addr, value=first_value, image_file=f"{graph_path}/vertices" ) - self.ctrl.req_port = self.interconnect.cpu_side_ports vertex_ranges = interleave_addresses( - 
AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): From e1d8a934fdbb80520c46e18a136a271ac676d255 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 19 Sep 2022 20:27:40 -0700 Subject: [PATCH 167/247] Adding stats to count the result of bitvector search. --- src/accl/graph/sega/coalesce_engine.cc | 12 +++++++----- src/accl/graph/sega/coalesce_engine.hh | 3 ++- src/accl/graph/sega/push_engine.cc | 3 +++ src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d4102a8bca..b870345d57 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1031,9 +1031,10 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; - } else { - stats.workSearchFails++; } + + stats.bitvectorSearchStatus[bit_status]++; + if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { @@ -1096,8 +1097,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), - ADD_STAT(workSearchFails, statistics::units::Count::get(), - "Number of times coalesce engine fails to find work to push."), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), @@ -1111,6 +1110,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. 
(Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), + "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rateblocked()) { + stats.numNetBlocks++; nextPushEvent.sleep(); return; } @@ -301,6 +302,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) push(_push), ADD_STAT(numUpdates, statistics::units::Count::get(), "Number of sent updates."), + ADD_STAT(numNetBlocks, statistics::units::Count::get(), + "Number of updates blocked by network."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second.") diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index b317992b2d..801d8e567d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -127,6 +127,7 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; statistics::Scalar numUpdates; + statistics::Scalar numNetBlocks; statistics::Formula TEPS; }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b16d827dbe..c6e8fda523 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -192,7 +192,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) wl.to_string(), workListFile.size()); stats.vertexReadLatency.sample( - (curTick() - vertexReadTime[addr]) / getClockFrequency()); + ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); vertexReadTime.erase(addr); assert(!workListFile.empty()); From a0a0fbeaa85a09ee2545adfaedfc251de483b6fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 22 Sep 2022 12:21:46 -0700 Subject: [PATCH 168/247] Adding a stat to count number of idle cycles. 
--- src/accl/graph/sega/push_engine.cc | 6 +++++- src/accl/graph/sega/push_engine.hh | 6 +++--- src/accl/graph/sega/wl_engine.hh | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a56283cbf6..5029013acd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,7 +28,6 @@ #include "accl/graph/sega/push_engine.hh" -#include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" @@ -40,6 +39,7 @@ namespace gem5 PushEngine::PushEngine(const Params& params): BaseMemoryEngine(params), _running(false), + lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), workload(params.workload), @@ -107,6 +107,7 @@ PushEngine::start() assert(!nextVertexPullEvent.scheduled()); _running = true; + stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. 
assert(workLeft()); if (vertexSpace()) { @@ -123,6 +124,7 @@ PushEngine::processNextVertexPullEvent() if (!workLeft()) { _running = false; + lastIdleEntranceTick = curTick(); } if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { @@ -304,6 +306,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of sent updates."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), + ADD_STAT(numIdleCycles, statistics::units::Count::get(), + "Number of cycles PushEngine has been idle."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second.") diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 801d8e567d..1f139d061e 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -37,7 +37,6 @@ namespace gem5 { -class CoalesceEngine; class MPU; class PushEngine : public BaseMemoryEngine @@ -88,10 +87,10 @@ class PushEngine : public BaseMemoryEngine Addr offset; int numElements; }; + MPU* owner; bool _running; - int numElementsPerLine; - MPU* owner; + Tick lastIdleEntranceTick; int numPendingPulls; int edgePointerQueueSize; @@ -128,6 +127,7 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numUpdates; statistics::Scalar numNetBlocks; + statistics::Scalar numIdleCycles; statistics::Formula TEPS; }; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 0c6361825e..3d527df3cf 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -77,6 +77,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + statistics::Histogram vertexReadLatency; }; From efcc6d2b35a03dcaa078f5c95d91ef6028c7805b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 22 Sep 2022 17:32:46 -0700 Subject: [PATCH 169/247] Adding stats to measure queueing latencies. 
--- configs/accl/sega.py | 3 ++- src/accl/graph/base/data_structs.hh | 6 ++++-- src/accl/graph/sega/coalesce_engine.cc | 17 +++++++++++++++-- src/accl/graph/sega/coalesce_engine.hh | 5 ++++- src/accl/graph/sega/push_engine.cc | 25 +++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 12 +++++++++--- src/accl/graph/sega/wl_engine.cc | 12 +++++++++--- src/accl/graph/sega/wl_engine.hh | 4 ++-- 8 files changed, 64 insertions(+), 20 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8c30d10dec..a67551a5fd 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -120,7 +120,8 @@ def __init__( ) self.ctrl = CenteralController( - addr=first_addr, value=first_value, + init_addr=first_addr, + init_value=first_value, image_file=f"{graph_path}/vertices" ) self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 026a3cb7b2..a46aaf2de9 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -96,8 +96,10 @@ struct CompleteEdge { uint32_t weight; uint32_t value; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): - src(src), dst(dst), weight(weight), value(value) + uint64_t entrance; + + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): + src(src), dst(dst), weight(weight), value(value), entrance(entrance) {} std::string to_string() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b870345d57..62cae01613 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -826,6 +826,8 @@ CoalesceEngine::processNextMemoryEvent() next_memory_function_tick) = memoryFunctionQueue.front(); next_memory_function(next_memory_function_input, next_memory_function_tick); memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) 
+ * 1e9 / getClockFrequency()); DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " "memoryFunctionQueue.size = %d.\n", __func__, memoryFunctionQueue.size()); @@ -929,6 +931,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); + stats.numInvalidMemFunctions++; } } @@ -1110,6 +1113,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(numInvalidMemFunctions, statistics::units::Count::get(), + "Number of times a scheduled memory function has been invalid."), ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), @@ -1123,7 +1128,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(mshrEntryLength, statistics::units::Count::get(), "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), - "Histogram of the length of the bitvector.") + "Histogram of the length of the bitvector."), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") { } @@ -1134,7 +1141,11 @@ CoalesceEngine::CoalesceStats::regStats() mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); bitvectorLength.init(64); - bitvectorSearchStatus.init(4); + bitvectorSearchStatus.init(NUM_STATUS); + bitvectorSearchStatus.subname(0, "PENDING_READ"); + bitvectorSearchStatus.subname(1, "IN_CACHE"); + bitvectorSearchStatus.subname(2, "IN_MEMORY"); + bitvectorSearchStatus.subname(3, "GARBAGE"); hitRate = (readHits + readHitUnderMisses) / (readHits + 
readHitUnderMisses + readMisses); @@ -1142,6 +1153,8 @@ CoalesceEngine::CoalesceStats::regStats() vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + memoryFunctionLatency.init(64); } void diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2b7b17d196..262f75fbcf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,7 +47,8 @@ enum BitStatus PENDING_READ, IN_CACHE, IN_MEMORY, - GARBAGE + GARBAGE, + NUM_STATUS }; class MPU; @@ -170,6 +171,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; + statistics::Scalar numInvalidMemFunctions; statistics::Vector bitvectorSearchStatus; @@ -179,6 +181,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; + statistics::Histogram memoryFunctionLatency; }; CoalesceStats stats; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5029013acd..af1c904eda 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -142,8 +142,10 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, (uint32_t) wl.prop); + edgePointerQueue.emplace_back( + start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, + (uint32_t) wl.prop, curTick()); numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -182,6 +184,9 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current 
EdgeReadInfoGen is done.\n", __func__); + stats.edgePointerQueueLatency.sample( + (curTick() - curr_info.entrance()) * + 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); @@ -224,8 +229,8 @@ PushEngine::handleMemResp(PacketPtr pkt) Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back(push_info.src, edge_dst, - edge_weight, push_info.value); + edges.emplace_back( + push_info.src, edge_dst, edge_weight, push_info.value, curTick()); } edgeQueue.push_back(edges); onTheFlyMemReqs--; @@ -267,7 +272,8 @@ PushEngine::processNextPushEvent() "with value: %d.\n", __func__, curr_edge.src, curr_edge.dst, update_value); - + stats.edgeQueueLatency.sample( + (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); edge_list.pop_front(); if (edge_list.empty()) { edgeQueue.pop_front(); @@ -310,7 +316,11 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of cycles PushEngine has been idle."), ADD_STAT(TEPS, statistics::units::Rate::get(), - "Traversed Edges Per Second.") + "Traversed Edges Per Second."), + ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgeQueue.") { } @@ -320,6 +330,9 @@ PushEngine::PushStats::regStats() using namespace statistics; TEPS = numUpdates / simSeconds; + + edgePointerQueueLatency.init(64); + edgeQueueLatency.init(64); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1f139d061e..5d2277eb5a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -52,11 +52,12 @@ class PushEngine : public BaseMemoryEngine Addr _src; uint32_t _value; + Tick 
_entrance; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _src(src), _value(value) + size_t atom, Addr src, uint32_t value, Tick entrance): + _start(start), _end(end), _step(step), _atom(atom), + _src(src), _value(value), _entrance(entrance) {} std::tuple nextReadPacketInfo() @@ -80,6 +81,8 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } + + Tick entrance() { return _entrance; } }; struct PushInfo { Addr src; @@ -130,6 +133,9 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numIdleCycles; statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgeQueueLatency; }; PushStats stats; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index c6e8fda523..5d4dd1723e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -79,7 +79,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", __func__, pkt->getAddr(), pkt->getLE(), @@ -105,7 +105,8 @@ WLEngine::processNextReadEvent() { Addr update_addr; uint32_t update_value; - std::tie(update_addr, update_value) = updateQueue.front(); + Tick enter_tick; + std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
" "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); @@ -134,6 +135,7 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); + stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -162,6 +164,7 @@ WLEngine::processNextReadEvent() update_value, update_addr, registerFile[update_addr]); stats.registerFileCoalesce++; updateQueue.pop_front(); + stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -246,7 +249,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "Number of times updates were " "stalled because of register shortage"), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), - "Histogram of the latency of reading a vertex.") + "Histogram of the latency of reading a vertex (ns)."), + ADD_STAT(updateQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of dequeuing an update (ns).") { } @@ -256,6 +261,7 @@ WLEngine::WorkListStats::regStats() using namespace statistics; vertexReadLatency.init(64); + updateQueueLatency.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 3d527df3cf..f888979be9 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -48,7 +48,7 @@ class WLEngine : public BaseReduceEngine MPU* owner; int updateQueueSize; - std::deque> updateQueue; + std::deque> updateQueue; int registerFileSize; std::unordered_map registerFile; @@ -77,8 +77,8 @@ class WLEngine : public BaseReduceEngine statistics::Scalar 
registerFileCoalesce; statistics::Scalar registerShortage; - statistics::Histogram vertexReadLatency; + statistics::Histogram updateQueueLatency; }; WorkListStats stats; From baa1dcb8df2e4d09a05ed6b97fd1b36c24f92e74 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 10:52:35 -0700 Subject: [PATCH 170/247] Added pybindmethod to createInitialUpdate. merge added. --- configs/accl/sega-simple.py | 8 +- configs/accl/sega-single-simple.py | 151 ++++++++++++++++++++ configs/accl/sega-single.py | 155 +++++++++++++++++++++ src/accl/graph/sega/CenteralController.py | 8 +- src/accl/graph/sega/MPU.py | 1 + src/accl/graph/sega/base_memory_engine.cc | 20 +-- src/accl/graph/sega/centeral_controller.cc | 131 +++++------------ src/accl/graph/sega/centeral_controller.hh | 39 ++---- src/accl/graph/sega/coalesce_engine.cc | 27 ++++ src/base/addr_range.hh | 31 +++++ 10 files changed, 430 insertions(+), 141 deletions(-) create mode 100644 configs/accl/sega-single-simple.py create mode 100644 configs/accl/sega-single.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index ae537e76ca..e0a4fcc89e 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -65,15 +65,15 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) self.vertex_mem_ctrl = SimpleMemory( - latency="75ns", + latency="0ns", latency_var="0ns", - bandwidth="19.2GB/s" + bandwidth="0GB/s" ) self.edge_mem_ctrl = SimpleMemory( - latency="75ns", + latency="30ns", latency_var="0ns", - bandwidth="19.2GB/s", + bandwidth="32GB/s", range=AddrRange(edge_memory_size), in_addr_map=False ) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py new file mode 100644 index 0000000000..a87e6c53bb --- /dev/null +++ b/configs/accl/sega-single-simple.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="0GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + def set_vertex_image(self, vertex_image): + self.vertex_mem_ctrl.image_file = vertex_image + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__(self, cache_size, 
graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + gpts = [GPT("8GiB", cache_size)] + gpts[0].set_vertex_range(AddrRange("4GiB")) + gpts[0].set_edge_image(f"{graph_path}/edgelist_0") + gpts[0].setReqPort(gpts[0].getRespPort()) + self.gpts = gpts + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.cache_size, args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py new file mode 100644 index 0000000000..d9fe11a781 --- /dev/null +++ b/configs/accl/sega-single.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GiB/s" + ) + + self.edge_mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.dram.image_file = edge_image + +class SEGA(System): + def __init__(self, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + 
self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + plain_vertex_range = AddrRange("4GiB") + self._vertex_ranges = interleave_addresses( + plain_vertex_range, + 1, + 32 + ) + + gpts = [GPT("8GiB", cache_size)] + gpts[0].set_vertex_range(self._vertex_ranges[0]) + gpts[0].set_edge_image(f"{graph_path}/edgelist_0") + gpts[0].setReqPort(gpts[0].getRespPort()) + self.gpts = gpts + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.cache_size, args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 9bee76511d..0721ff977c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -27,6 +27,7 @@ from m5.params import * from m5.proxy import * +from m5.util.pybind import PyBindMethod from m5.objects.ClockedObject import ClockedObject class CenteralController(ClockedObject): @@ -35,10 +36,9 @@ class CenteralController(ClockedObject): cxx_class = 'gem5::CenteralController' system = Param.System(Parent.any, "System
this Engine is a part of") - req_port = RequestPort("Port to send updates to the outside") + + image_file = Param.String("Path to the vertex image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") - init_addr = Param.Addr("The addr for the initial update") - init_value = Param.Int("The value for the initial update") - image_file = Param.String("Path to the global memory image.") + cxx_exports = [PyBindMethod("createInitialBFSUpdate")] diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 2d65be2949..d80142b21e 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -45,3 +45,4 @@ class MPU(SimObject): "each instance of MPU object.") push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") + diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index 9bd1941b23..d9864664b1 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -59,14 +59,18 @@ void BaseMemoryEngine::init() { AddrRangeList memory_ranges = memPort.getAddrRanges(); - // BaseMemoryEngine only supports one memory. - assert(memory_ranges.size() == 1); - - peerMemoryRange = memory_ranges.front(); - DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is %s. " - "The range is %s interleaved.\n", __func__, - peerMemoryRange.to_string(), - peerMemoryRange.interleaved() ? "" : "not"); + + if (memory_ranges.size() == 2) { + peerMemoryRange = merge(memory_ranges.front(), memory_ranges.back()); + } else if (memory_ranges.size() == 1) { + peerMemoryRange = memory_ranges.front(); + } else { + panic("Received an unacceptable number of ranges from memory."); + } + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " + "%s. The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); } void diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c6de1d8390..68b88e9e77 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -37,12 +37,9 @@ namespace gem5 { -CenteralController::CenteralController - (const CenteralControllerParams ¶ms): +CenteralController::CenteralController(const Params& params): ClockedObject(params), - system(params.system), - reqPort(name() + ".req_port", this), - maxVertexAddr(0) + system(params.system) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -50,33 +47,35 @@ CenteralController::CenteralController } } -Port& -CenteralController::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - void CenteralController::initState() { - // ClockedObject::initState(); - + for (auto mpu: mpuVector) { + addrRangeListMap[mpu] = mpu->getAddrRanges(); + } const auto& file = params().image_file; if (file == "") return; - auto *object = loader::createObjectFile(file, true); + auto* object = loader::createObjectFile(file, true); fatal_if(!object, "%s: Could not load %s.", name(), file); loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); - maxVertexAddr = image.maxAddr(); - PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, - system->cacheLineSize()); + Addr maxVertexAddr = image.maxAddr(); + + PortProxy proxy( + [this](PacketPtr pkt) { + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + for (auto range: range_list) { + if (range.contains(pkt->getAddr())) { + mpu->recvFunctional(pkt); + break; + } + } + } + }, system->cacheLineSize()); panic_if(!image.write(proxy), "%s: Unable to write image."); } @@ -84,21 +83,24 @@ CenteralController::initState() void CenteralController::startup() { - Addr 
initial_addr = params().init_addr; - uint32_t initial_value = params().init_value; - PacketPtr first_update = - createUpdatePacket(initial_addr, initial_value); - - if (!reqPort.blocked()) { - reqPort.sendPacket(first_update); + while(!initialUpdates.empty()) { + PacketPtr front = initialUpdates.front(); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + for (auto range: range_list) { + if (range.contains(front->getAddr())) { + mpu->handleIncomingUpdate(front); + } + } + } + initialUpdates.pop_front(); } } template PacketPtr CenteralController::createUpdatePacket(Addr addr, T value) { - RequestPtr req = std::make_shared( - addr, sizeof(T), addr, value); + RequestPtr req = std::make_shared(addr, sizeof(T), addr, value); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher // bits req->setPC(((Addr) value) << 2); @@ -106,65 +108,17 @@ CenteralController::createUpdatePacket(Addr addr, T value) PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} -PacketPtr -CenteralController::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC((Addr) 0); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); + pkt->setLE(value); return pkt; } void -CenteralController::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -CenteralController::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!_blocked) { - blockedPacket = nullptr; - } -} - -void -CenteralController::functionalAccess(PacketPtr pkt) +CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) { - DPRINTF(CenteralController, - "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - reqPort.sendFunctional(pkt); + PacketPtr update = createUpdatePacket(init_addr, init_value); + initialUpdates.push_back(update); } void @@ -176,19 +130,6 @@ CenteralController::recvDoneSignal() } if (done) { - for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) { - PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); - reqPort.sendFunctional(pkt); - - int num_items = system->cacheLineSize() / sizeof(WorkListItem); - WorkListItem items[num_items]; - pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); - - for (int i = 0; i < num_items; i++) { - DPRINTF(FinalAnswer, "%s: WorkListItem[%lu][%d]: %s.\n", - __func__, addr, i, items[i].to_string()); - } - } exitSimLoopNow("no update left to process."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index bd272cf30d..4a4e9c7cb1 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/mpu.hh" +#include "base/addr_range.hh" #include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" @@ -44,46 +45,24 @@ namespace gem5 
class CenteralController : public ClockedObject { private: - class ReqPort : public RequestPort - { - private: - CenteralController* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, CenteralController* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - System* system; - ReqPort reqPort; Addr maxVertexAddr; + std::deque initialUpdates; + std::vector mpuVector; + std::unordered_map addrRangeListMap; - template PacketPtr - createUpdatePacket(Addr addr, T value); - PacketPtr createReadPacket(Addr addr, unsigned int size); - void functionalAccess(PacketPtr pkt); + template PacketPtr createUpdatePacket(Addr addr, T value); public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - virtual void initState(); - virtual void startup(); + virtual void initState() override; + virtual void startup() override; + + void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void recvDoneSignal(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 62cae01613..ac62254fd6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -127,6 +127,15 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); + // bool found = false; + // Addr trimmed_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(addr)) { + // trimmed_addr = range.removeIntlvBits(addr); + // found = true; + // } + // } + // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } @@ -136,6 +145,15 @@ int 
CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); + // bool found = false; + // Addr trimmed_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(addr)) { + // trimmed_addr = range.removeIntlvBits(addr); + // found = true; + // } + // } + // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); @@ -147,7 +165,16 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); + // bool found = false; Addr trimmed_addr = index * sizeof(WorkListItem); + // Addr upgraded_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(trimmed_addr)) { + // upgraded_addr = range.addIntlvBits(trimmed_addr); + // found = true; + // } + // } + // assert(found); return peerMemoryRange.addIntlvBits(trimmed_addr); } diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 07bd255d26..a4bf581224 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -48,6 +48,7 @@ #include "base/bitfield.hh" #include "base/cprintf.hh" +#include "base/intmath.hh" #include "base/logging.hh" #include "base/types.hh" @@ -732,6 +733,36 @@ class AddrRange { return !(*this == r); } + + friend AddrRange + merge(const AddrRange& left, const AddrRange& right) + { + assert(left.interleaved()); + assert(right.interleaved()); + assert(left.mergesWith(right)); + + int bits_org = left.masks.size(); + int bits_new = bits_org - 1; + + int left_match = left.intlvMatch; + int right_match = right.intlvMatch; + assert(std::abs(left_match - right_match) == (1 << bits_new)); + + Addr last_mask = left.masks[left.masks.size() - 1]; + int xor_high_bit_org = 0; + int xor_high_bit_new = 0; + if (!isPowerOf2(last_mask)) { + xor_high_bit_org = ceilLog2(last_mask); + xor_high_bit_new = xor_high_bit_org - 2; + } + int 
intlv_high_bit_org = + ceilLog2(last_mask ^ (1 << xor_high_bit_org)); + int intlv_high_bit_new = intlv_high_bit_org - 2; + + int match = std::min(left_match, right_match); + return AddrRange(left._start, left._end, intlv_high_bit_new, + xor_high_bit_new, bits_new, match); + } }; static inline AddrRangeList From a0461dea5bdcbf67dd89752790902f5e68e070fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 12:07:38 -0700 Subject: [PATCH 171/247] Adding stat to measure response latency. --- configs/accl/sega-simple.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 24 +++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 2 ++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index e0a4fcc89e..fffc273ee1 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -113,7 +113,7 @@ def __init__( ): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '1GHz' + self.clk_domain.clock = '4GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ac62254fd6..43d352da30 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -274,6 +274,7 @@ CoalesceEngine::recvWLRead(Addr addr) "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; return true; } else { // miss @@ -618,9 +619,16 @@ CoalesceEngine::processNextResponseEvent() DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" "responseQueue.size = %d.\n", __func__, responseQueue.size()); - if ((num_responses_sent >= maxRespPerCycle) || - (responseQueue.empty())) { - break; + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; } } @@ -1127,6 +1135,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), @@ -1156,6 +1167,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), "Histogram of the length of the bitvector."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), "Histogram of the latency of processing a memory function.") { @@ -1166,8 +1179,6 @@ CoalesceEngine::CoalesceStats::regStats() { using namespace statistics; - mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); - bitvectorLength.init(64); bitvectorSearchStatus.init(NUM_STATUS); bitvectorSearchStatus.subname(0, "PENDING_READ"); bitvectorSearchStatus.subname(1, "IN_CACHE"); @@ -1181,6 +1192,9 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); + responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 262f75fbcf..705285ba23 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -165,6 +165,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; + statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; @@ -181,6 +182,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; + statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; From fbbd888e40e6e23b61331aee037a1ebc1a71e695 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 17:01:13 -0700 Subject: [PATCH 172/247] Adding stats to count model inaccuracies. 
--- src/accl/graph/sega/coalesce_engine.cc | 9 +++++++-- src/accl/graph/sega/coalesce_engine.hh | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 43d352da30..0a4a041176 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -828,6 +828,8 @@ CoalesceEngine::processNextApplyEvent() } DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + } else { + stats.numInvalidApplies++; } applyQueue.pop_front(); @@ -966,7 +968,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); - stats.numInvalidMemFunctions++; + stats.numInvalidWriteBacks++; } } @@ -1151,7 +1153,10 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. 
(Relative to reset_stats)"), - ADD_STAT(numInvalidMemFunctions, statistics::units::Count::get(), + ADD_STAT(numInvalidApplies, statistics::units::Count::get(), + "Number of times a line has become busy" + " while waiting to be applied."), + ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), "Number of times a scheduled memory function has been invalid."), ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), "Distribution for the location of vertex searches."), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 705285ba23..b1f5b1fea1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -172,7 +172,8 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidMemFunctions; + statistics::Scalar numInvalidApplies; + statistics::Scalar numInvalidWriteBacks; statistics::Vector bitvectorSearchStatus; From 411bfa11be14dda13cc38351c2efeab4737503da Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 29 Sep 2022 15:11:01 -0700 Subject: [PATCH 173/247] style fix. 
--- src/accl/graph/sega/push_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index af1c904eda..6ff1f77c45 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -335,4 +335,4 @@ PushEngine::PushStats::regStats() edgeQueueLatency.init(64); } -} +} // namespace gem5 From bf9bed1ca66b949bba7d03001f34fb6ed30c97b2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 28 Sep 2022 12:37:13 -0700 Subject: [PATCH 174/247] Adding multiple queues and ports in pushEngine --- src/accl/graph/base/data_structs.hh | 24 +++++++- src/accl/graph/sega/MPU.py | 8 ++- src/accl/graph/sega/mpu.cc | 90 ++++++++++++++++++++++++++++- src/accl/graph/sega/mpu.hh | 15 ++++- src/accl/graph/sega/push_engine.cc | 12 +++- src/accl/graph/sega/push_engine.hh | 2 +- 6 files changed, 137 insertions(+), 14 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index a46aaf2de9..d3db3edda5 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -90,7 +90,7 @@ struct __attribute__ ((packed)) Edge static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); -struct CompleteEdge { +struct MetaEdge { uint64_t src; uint64_t dst; uint32_t weight; @@ -98,17 +98,35 @@ struct CompleteEdge { uint64_t entrance; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): + MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): src(src), dst(dst), weight(weight), value(value), entrance(entrance) {} std::string to_string() { - return csprintf("CompleteEdge{src: %lu, dst:%lu, weight: %u}", + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u}", src, dst, weight); } }; +struct Update { + uint64_t src; + uint64_t dst; + uint32_t value; + + Update(): src(0), dst(0), value(0) + {} + Update(uint64_t 
src, uint64_t dst, uint32_t value): + src(src), dst(dst), value(value) + {} + + std::string to_string() + { + return csprintf("Update{src: %lu, dst:%lu, value: %u}", + src, dst, value); + } +}; + template class UniqueFIFO { diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index d80142b21e..1ea6a868a9 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject -class MPU(SimObject): +class MPU(ClockedObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" @@ -39,6 +39,8 @@ class MPU(SimObject): in_port = ResponsePort("Port to receive updates from outside") out_port = RequestPort("Port to send updates to the outside") + out_ports = VectorRequestPort("Ports to remote MPUs ") + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " @@ -46,3 +48,5 @@ class MPU(SimObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") + update_queue_size = Param.Int(16, "Maximum number of entries " + "for each update queue.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 63aa474542..8897e5a959 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,23 +29,32 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "mem/packet_access.hh" #include "sim/sim_exit.hh" namespace gem5 { MPU::MPU(const Params& params): - SimObject(params), + ClockedObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), inPort(name() + ".inPort", this), - outPort(name() + ".outPort", this) + outPort(name() + ".outPort", this), + updateQueueSize(params.update_queue_size), 
+ nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); + + + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + + outports.emplace_back(name() + ".out_ports" + std::to_string(i), this); + } } Port& @@ -55,8 +64,10 @@ MPU::getPort(const std::string& if_name, PortID idx) return inPort; } else if (if_name == "out_port") { return outPort; + } else if (if_name == "out_ports") { + return outports[idx]; } else { - return SimObject::getPort(if_name, idx); + return ClockedObject::getPort(if_name, idx); } } @@ -166,6 +177,79 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } +bool +MPU::enqueueUpdate(Update update) +{ + // Creating the packet + Addr dst_addr = update.dst; + bool found_locally = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + + for (int i = 0; i < outports.size(); i++) { + AddrRangeList addrList = outports[i].getAddrRanges(); + for (auto range : addrList) { + if (range.contains(dst_addr)) { + if (updateQueues[i].size() < updateQueueSize) { + updateQueues[i].emplace_back(update, curTick()); + return true; + } else { + return false; + } + } + } + } + + panic("The update created does not match to any outport."); +} + +template PacketPtr +MPU::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +MPU::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < updateQueues.size(); i++) { + Update update; + Tick entrance_tick; + std::tie
(outports[i].blocked()) { + continue; + } + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outports[i].sendPacket(pkt); + updateQueues[i].pop_front(); + if (updateQueues[i].size() > 0) { + next_time_send += 1; + } + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + +} + void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index edf0350caf..d7042540f0 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -36,7 +36,7 @@ #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "sim/sim_object.hh" +#include "sim/clocked_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -45,7 +45,7 @@ namespace gem5 class CenteralController; -class MPU : public SimObject +class MPU : public ClockedObject { private: class RespPort : public ResponsePort @@ -99,6 +99,16 @@ class MPU : public SimObject AddrRangeList localAddrRange; + uint32_t updateQueueSize; + + std::vector outports; + std::vector>> updateQueues; + + template PacketPtr createUpdatePacket(Addr addr, T value); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + public: PARAMS(MPU); MPU(const Params& params); @@ -115,6 +125,7 @@ class MPU : public SimObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); + bool enqueueUpdate(Update update); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return coalesceEngine->recvVertexPull(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 6ff1f77c45..4546ceee47 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -224,7 +224,7 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = 
reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque edges; + std::deque edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; @@ -255,8 +255,8 @@ PushEngine::processNextPushEvent() return; } - std::deque& edge_list = edgeQueue.front(); - CompleteEdge curr_edge = edge_list.front(); + std::deque& edge_list = edgeQueue.front(); + MetaEdge curr_edge = edge_list.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, curr_edge.to_string()); @@ -267,6 +267,12 @@ PushEngine::processNextPushEvent() curr_edge.dst, update_value); owner->sendPacket(update); + + Update update_2(curr_edge.src, curr_edge.dst, update_value); + (!owner->enqueueUpdate(update_2)) { + // edge_list.pop_front(); + // edge_list.push_back(curr_edge); + } stats.numUpdates++; DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " "with value: %d.\n", __func__, curr_edge.src, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5d2277eb5a..d6763e3ab7 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -102,7 +102,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; - std::deque> edgeQueue; + std::deque> edgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); From 32c75813f3bff0af05a960ad8b40d2f731a9296d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 28 Sep 2022 13:20:32 -0700 Subject: [PATCH 175/247] Changing propagate function --- src/accl/graph/sega/PushEngine.py | 7 ++- src/accl/graph/sega/push_engine.cc | 80 ++++++++++++------------------ src/accl/graph/sega/push_engine.hh | 5 +- 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index ad9ddfefcf..7dba86aff2 100644 --- 
a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -40,6 +40,9 @@ class PushEngine(BaseMemoryEngine): # significantly bigger than push_req_queue_size resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " - "edges read from memory") + "edges read from memory.") + + max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " + "done per cycle.") - workload = Param.String("BFS", "Name of the workload") + workload = Param.String("BFS", "Name of the workload.") diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4546ceee47..c82a4c88be 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,10 +42,11 @@ PushEngine::PushEngine(const Params& params): lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + maxPropagatesPerCycle(params.max_propagates_per_cycle), workload(params.workload), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), stats(*this) {} @@ -55,16 +56,6 @@ PushEngine::registerMPU(MPU* mpu) owner = mpu; } -void -PushEngine::recvReqRetry() -{ - DPRINTF(PushEngine, "%s: Received a req retry.\n", __func__); - if (nextPushEvent.pending()) { - nextPushEvent.wake(); - schedule(nextPushEvent, nextCycle()); - } -} - bool PushEngine::vertexSpace() { @@ -238,57 +229,52 @@ PushEngine::handleMemResp(PacketPtr pkt) delete pkt_data; delete pkt; - if ((!nextPushEvent.pending()) && - (!nextPushEvent.scheduled())) { - schedule(nextPushEvent, nextCycle()); + if (!nextPropagateEvent.scheduled()) { + schedule(nextPropagateEvent, nextCycle()); } return true; } // TODO: Add a parameter to allow for 
doing multiple pushes at the same time. void -PushEngine::processNextPushEvent() +PushEngine::processNextPropagateEvent() { - if (owner->blocked()) { - stats.numNetBlocks++; - nextPushEvent.sleep(); - return; - } + int num_propagates = 0; + while(true) { + std::deque& edge_list = edgeQueue.front(); + MetaEdge curr_edge = edge_list.front(); - std::deque& edge_list = edgeQueue.front(); - MetaEdge curr_edge = edge_list.front(); + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, curr_edge.to_string()); - DPRINTF(PushEngine, "%s: The edge to process is %s.\n", - __func__, curr_edge.to_string()); + uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); - // TODO: Implement propagate function here - uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); - PacketPtr update = createUpdatePacket( - curr_edge.dst, update_value); - - owner->sendPacket(update); - - Update update_2(curr_edge.src, curr_edge.dst, update_value); - (!owner->enqueueUpdate(update_2)) { - // edge_list.pop_front(); - // edge_list.push_back(curr_edge); - } - stats.numUpdates++; - DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " - "with value: %d.\n", __func__, curr_edge.src, + Update update(curr_edge.src, curr_edge.dst, update_value); + edge_list.pop_front(); + if (owner->enqueueUpdate(update)) { + DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to " + "addr: %lu with value: %d.\n", __func__, curr_edge.src, curr_edge.dst, update_value); + stats.numUpdates++; + stats.edgeQueueLatency.sample( + (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); + } else { + edge_list.push_back(curr_edge); + } - stats.edgeQueueLatency.sample( - (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); - edge_list.pop_front(); - if (edge_list.empty()) { - edgeQueue.pop_front(); + num_propagates++; + if (num_propagates >= maxPropagatesPerCycle) { + break; + } + + if (edge_list.empty()) { + edgeQueue.pop_front(); + } } - 
assert(!nextPushEvent.pending()); - assert(!nextPushEvent.scheduled()); + assert(!nextPropagateEvent.scheduled()); if (!edgeQueue.empty()) { - schedule(nextPushEvent, nextCycle()); + schedule(nextPropagateEvent, nextCycle()); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index d6763e3ab7..f3304a8e2a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -102,6 +102,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; + int maxPropagatesPerCycle; std::deque> edgeQueue; std::string workload; @@ -117,8 +118,8 @@ class PushEngine : public BaseMemoryEngine MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); - MemoryEvent nextPushEvent; - void processNextPushEvent(); + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); struct PushStats : public statistics::Group { From 666ab3de782318df4c94fa1baa52c94fd11b6c13 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 29 Sep 2022 14:59:25 -0700 Subject: [PATCH 176/247] Pushing on Marjan's behalf, refactored out_port to vector-port. 
--- configs/accl/sega-single-simple.py | 6 +- configs/accl/sega-single.py | 4 +- src/accl/graph/base/data_structs.hh | 8 +-- src/accl/graph/sega/MPU.py | 3 +- src/accl/graph/sega/mpu.cc | 85 +++++++++++++++-------------- src/accl/graph/sega/mpu.hh | 20 ++++--- src/accl/graph/sega/push_engine.cc | 64 +++++++++------------- src/accl/graph/sega/push_engine.hh | 3 +- 8 files changed, 94 insertions(+), 99 deletions(-) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py index a87e6c53bb..92c1c9cbcb 100644 --- a/configs/accl/sega-single-simple.py +++ b/configs/accl/sega-single-simple.py @@ -92,10 +92,10 @@ def getRespPort(self): def setRespPort(self, port): self.mpu.in_port = port - def getReqPort(self): - return self.mpu.out_port def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port + def getReqPort(self): + return self.mpu.out_ports def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py index d9fe11a781..e4f7942f42 100644 --- a/configs/accl/sega-single.py +++ b/configs/accl/sega-single.py @@ -92,9 +92,9 @@ def setRespPort(self, port): self.mpu.in_port = port def getReqPort(self): - return self.mpu.out_port + return self.mpu.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index d3db3edda5..34c8eb98ce 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -96,10 +96,10 @@ struct MetaEdge { uint32_t weight; uint32_t value; - uint64_t entrance; - - MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): - src(src), dst(dst), weight(weight), value(value), entrance(entrance) + MetaEdge(): src(0), dst(0), weight(0), value(0) + {} + 
MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) {} std::string to_string() diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 1ea6a868a9..aad2e060d1 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -37,9 +37,8 @@ class MPU(ClockedObject): system = Param.System(Parent.any, "System this MPU is a part of") in_port = ResponsePort("Port to receive updates from outside") - out_port = RequestPort("Port to send updates to the outside") - out_ports = VectorRequestPort("Ports to remote MPUs ") + out_ports = VectorRequestPort("Outgoing ports to all MPUs") wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 8897e5a959..f86c7e02b7 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -42,7 +43,6 @@ MPU::MPU(const Params& params): coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), inPort(name() + ".inPort", this), - outPort(name() + ".outPort", this), updateQueueSize(params.update_queue_size), nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { @@ -52,8 +52,9 @@ MPU::MPU(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - - outports.emplace_back(name() + ".out_ports" + std::to_string(i), this); + outPorts.emplace_back( + name() + ".outPorts" + std::to_string(i), this, i); + updateQueues.emplace_back(); } } @@ -62,10 +63,8 @@ MPU::getPort(const std::string& if_name, PortID idx) { if (if_name == "in_port") { return inPort; - } else if (if_name == "out_port") { - return outPort; - } else if (if_name == "outPorts") { - return outports[idx]; + } else if (if_name == 
"out_ports") { + return outPorts[idx]; } else { return ClockedObject::getPort(if_name, idx); } @@ -76,6 +75,9 @@ MPU::init() { localAddrRange = getAddrRanges(); inPort.sendRangeChange(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = getAddrRanges(); + } } void @@ -137,8 +139,6 @@ MPU::ReqPort::sendPacket(PacketPtr pkt) if (!sendTimingReq(pkt)) { blockedPacket = pkt; - } else { - owner->recvReqRetry(); } } @@ -157,6 +157,17 @@ MPU::ReqPort::recvReqRetry() PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); + if (blockedPacket == nullptr) { + owner->recvReqRetry(); + } +} + +void +MPU::recvReqRetry() +{ + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } } bool @@ -180,28 +191,34 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) bool MPU::enqueueUpdate(Update update) { - // Creating the packet Addr dst_addr = update.dst; bool found_locally = false; + bool accepted = false; for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } - - for (int i = 0; i < outports.size(); i++) { - AddrRangeList addrList = outports[i].getAddrRanges(); - for (auto range : addrList) { + DPRINTF(MPU, "%s: TESSSSTSSSS %d, %d, %llu.\n", + __func__, outPorts.size(), updateQueues[0].size(), dst_addr); + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + for (auto range : addr_range_list) { if (range.contains(dst_addr)) { if (updateQueues[i].size() < updateQueueSize) { + DPRINTF(MPU, "%s: Queue %d received an update.\n", + __func__, i); updateQueues[i].emplace_back(update, curTick()); - return true; - } else { - return false; + accepted = true; + break; } } } } - panic("The update created does not match to any outport."); + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; } template PacketPtr @@ -228,14 +245,19 @@ MPU::processNextUpdatePushEvent() 
int next_time_send = 0; for (int i = 0; i < updateQueues.size(); i++) { + if (updateQueues[i].empty()) { + continue; + } + if (outPorts[i].blocked()) { + continue; + } Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[i].front(); - if (outports[i].blocked()) { - continue; - } PacketPtr pkt = createUpdatePacket(update.dst, update.value); - outports[i].sendPacket(pkt); + outPorts[i].sendPacket(pkt); + DPRINTF(MPU, "%s: Sent update from addr: %lu to addr: %lu with value: " + "%d.\n", __func__, update.src, update.dst, update.value); updateQueues[i].pop_front(); if (updateQueues[i].size() > 0) { next_time_send += 1; @@ -256,25 +278,6 @@ MPU::recvVertexPush(Addr addr, WorkListItem wl) pushEngine->recvVertexPush(addr, wl); } -void -MPU::sendPacket(PacketPtr pkt) -{ - bool found_locally = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(pkt->getAddr()); - } - - if (found_locally) { - // TODO: count number of local updates - - } else { - // TOOD: count number of remote updates - - } - - outPort.sendPacket(pkt); -} - void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index d7042540f0..1a642e7873 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,6 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ +#include +#include + #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/push_engine.hh" @@ -74,13 +77,16 @@ class MPU : public ClockedObject private: MPU* owner; PacketPtr blockedPacket; + PortID _id; public: - ReqPort(const std::string& name, MPU* owner) : - RequestPort(name, owner), owner(owner), blockedPacket(nullptr) + ReqPort(const std::string& name, MPU* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } + PortID id() 
{ return _id; } protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -95,15 +101,17 @@ class MPU : public ClockedObject PushEngine* pushEngine; RespPort inPort; - ReqPort outPort; AddrRangeList localAddrRange; uint32_t updateQueueSize; - std::vector outports; + std::unordered_map portAddrMap; + + std::vector outPorts; std::vector>> updateQueues; + template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextUpdatePushEvent; @@ -133,9 +141,7 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - bool blocked() { return outPort.blocked(); } - void sendPacket(PacketPtr pkt); - void recvReqRetry() { pushEngine->recvReqRetry(); } + void recvReqRetry(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c82a4c88be..d533f1ea79 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -215,15 +215,18 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque edges; + std::deque> edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back( - push_info.src, edge_dst, edge_weight, push_info.value, curTick()); + MetaEdge meta_edge( + push_info.src, edge_dst, edge_weight, push_info.value); + edges.emplace_back(meta_edge, curTick()); } + assert(!edges.empty()); edgeQueue.push_back(edges); + onTheFlyMemReqs--; reqInfoMap.erase(pkt->req); delete pkt_data; @@ -235,40 +238,44 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -// TODO: Add a parameter to allow for doing multiple pushes at the same time. 
void PushEngine::processNextPropagateEvent() { int num_propagates = 0; while(true) { - std::deque& edge_list = edgeQueue.front(); - MetaEdge curr_edge = edge_list.front(); + std::deque>& edge_list = edgeQueue.front(); + MetaEdge meta_edge; + Tick entrance_tick; + std::tie(meta_edge, entrance_tick) = edge_list.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", - __func__, curr_edge.to_string()); - - uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); + __func__, meta_edge.to_string()); - Update update(curr_edge.src, curr_edge.dst, update_value); + uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); + Update update(meta_edge.src, meta_edge.dst, update_value); edge_list.pop_front(); + if (owner->enqueueUpdate(update)) { - DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to " - "addr: %lu with value: %d.\n", __func__, curr_edge.src, - curr_edge.dst, update_value); + DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", + __func__, meta_edge.to_string()); stats.numUpdates++; stats.edgeQueueLatency.sample( - (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { - edge_list.push_back(curr_edge); + edge_list.emplace_back(meta_edge, entrance_tick); } - num_propagates++; - if (num_propagates >= maxPropagatesPerCycle) { + if (edge_list.empty()) { + edgeQueue.pop_front(); + } + + if (edgeQueue.empty()) { break; } - if (edge_list.empty()) { - edgeQueue.pop_front(); + num_propagates++; + if (num_propagates >= maxPropagatesPerCycle) { + break; } } @@ -278,25 +285,6 @@ PushEngine::processNextPropagateEvent() } } -template PacketPtr -PushEngine::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared( - addr, sizeof(T), 0, _requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new 
Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} - PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index f3304a8e2a..fed6909733 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -103,11 +103,10 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; int maxPropagatesPerCycle; - std::deque> edgeQueue; + std::deque>> edgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); - template PacketPtr createUpdatePacket(Addr addr, T value); bool vertexSpace(); bool workLeft(); From 194a5e4983af2498452daba971db27a2468148b6 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 30 Sep 2022 08:37:23 -0700 Subject: [PATCH 177/247] Attempting to add multi-inports to MPU --- configs/accl/sega-single-simple.py | 4 +-- configs/accl/sega.py | 49 ++++++++++++------------------ src/accl/graph/sega/MPU.py | 5 +-- src/accl/graph/sega/mpu.cc | 37 +++++++++++++--------- src/accl/graph/sega/mpu.hh | 13 ++++---- src/accl/graph/sega/wl_engine.cc | 2 +- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py index 92c1c9cbcb..eacb16d3d1 100644 --- a/configs/accl/sega-single-simple.py +++ b/configs/accl/sega-single-simple.py @@ -88,9 +88,9 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_port + return self.mpu.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.mpu.in_ports = port def setReqPort(self, port): self.mpu.out_ports = port diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a67551a5fd..455d081145 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -78,18 +78,19 @@ def __init__(self, edge_memory_size: str, 
cache_size: str): self.mpu = MPU( wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine + push_engine=self.push_engine, + update_queue_size=16 ) def getRespPort(self): - return self.mpu.in_port + return self.mpu.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.mpu.in_ports = port def getReqPort(self): - return self.mpu.out_port + return self.mpu.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.dram.range = vertex_range @@ -97,14 +98,7 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): - def __init__( - self, - num_mpus, - cache_size, - graph_path, - first_addr, - first_value - ): + def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -112,19 +106,7 @@ def __init__( self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar( - frontend_latency=1, - forward_latency=1, - response_latency=1, - width=64 - ) - - self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, - image_file=f"{graph_path}/vertices" - ) - self.ctrl.req_port = self.interconnect.cpu_side_ports + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( AddrRange(start=0, size="4GiB"), @@ -137,13 +119,18 @@ def __init__( gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpt.setReqPort(self.interconnect.cpu_side_ports) - gpt.setRespPort(self.interconnect.mem_side_ports) gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def 
create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -160,10 +147,12 @@ def get_inputs(): if __name__ == "__m5_main__": num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) m5.instantiate() + system.create_initial_bfs_update(init_addr, init_value) exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} because {exit_event.getCause()}") + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index aad2e060d1..aea76db86f 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -36,7 +36,8 @@ class MPU(ClockedObject): system = Param.System(Parent.any, "System this MPU is a part of") - in_port = ResponsePort("Port to receive updates from outside") + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote outside") out_ports = VectorRequestPort("Outgoing ports to all MPUs") @@ -47,5 +48,5 @@ class MPU(ClockedObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") - update_queue_size = Param.Int(16, "Maximum number of entries " + update_queue_size = Param.Int("Maximum number of entries " "for each update queue.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index f86c7e02b7..4a80b22979 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -42,7 +42,6 @@ MPU::MPU(const Params& params): wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), - inPort(name() + ".inPort", this), updateQueueSize(params.update_queue_size), 
nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { @@ -53,16 +52,21 @@ MPU::MPU(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( - name() + ".outPorts" + std::to_string(i), this, i); + name() + ".out_ports" + std::to_string(i), this, i); updateQueues.emplace_back(); } + + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } } Port& MPU::getPort(const std::string& if_name, PortID idx) { - if (if_name == "in_port") { - return inPort; + if (if_name == "in_ports") { + return inPorts[idx]; } else if (if_name == "out_ports") { return outPorts[idx]; } else { @@ -74,9 +78,11 @@ void MPU::init() { localAddrRange = getAddrRanges(); - inPort.sendRangeChange(); + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = getAddrRanges(); + portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); } } @@ -101,6 +107,14 @@ MPU::RespPort::checkRetryReq() } } +void +MPU::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); ++i) { + inPorts[i].checkRetryReq(); + } +} + bool MPU::RespPort::recvTimingReq(PacketPtr pkt) { @@ -197,16 +211,13 @@ MPU::enqueueUpdate(Update update) for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } - DPRINTF(MPU, "%s: TESSSSTSSSS %d, %d, %llu.\n", - __func__, outPorts.size(), updateQueues[0].size(), dst_addr); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; for (auto range : addr_range_list) { if (range.contains(dst_addr)) { - if (updateQueues[i].size() < updateQueueSize) { - DPRINTF(MPU, "%s: Queue %d received an update.\n", - __func__, i); - updateQueues[i].emplace_back(update, curTick()); + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(MPU, "%s: Queue %d 
received an update.\n", __func__, i); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); accepted = true; break; } @@ -268,8 +279,6 @@ MPU::processNextUpdatePushEvent() if (next_time_send > 0) { schedule(nextUpdatePushEvent, nextCycle()); } - - } void diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 1a642e7873..ff17eada0e 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -56,13 +56,16 @@ class MPU : public ClockedObject private: MPU* owner; bool needSendRetryReq; + PortID _id; public: - RespPort(const std::string& name, MPU* owner): - ResponsePort(name, owner), owner(owner), needSendRetryReq(false) + RespPort(const std::string& name, MPU* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; + PortID id() { return _id; } void checkRetryReq(); protected: @@ -100,18 +103,16 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - RespPort inPort; - AddrRangeList localAddrRange; uint32_t updateQueueSize; std::unordered_map portAddrMap; + std::vector inPorts; std::vector outPorts; std::vector>> updateQueues; - template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextUpdatePushEvent; @@ -129,7 +130,6 @@ class MPU : public ClockedObject void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } bool handleIncomingUpdate(PacketPtr pkt); - void checkRetryReq() { inPort.checkRetryReq(); } void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); @@ -142,6 +142,7 @@ class MPU : public ClockedObject void recvVertexPush(Addr addr, WorkListItem wl); void recvReqRetry(); + void checkRetryReq(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 
5d4dd1723e..0267bd46b6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -74,7 +74,7 @@ WLEngine::reduce(uint32_t update, uint32_t value) bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - assert(updateQueue.size() <= updateQueueSize); + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } From cddd042f6330e0da3e36dc2f278898944eb30d31 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 3 Oct 2022 09:06:36 -0700 Subject: [PATCH 178/247] Moving reqPorts from MPU to PushEngine --- configs/accl/sega.py | 10 +- src/accl/graph/sega/MPU.py | 4 - src/accl/graph/sega/PushEngine.py | 7 +- src/accl/graph/sega/mpu.cc | 136 +------------------------ src/accl/graph/sega/mpu.hh | 36 ------- src/accl/graph/sega/push_engine.cc | 154 ++++++++++++++++++++++++++++- src/accl/graph/sega/push_engine.hh | 36 +++++++ 7 files changed, 200 insertions(+), 183 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 455d081145..21a041180f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,7 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64 + resp_queue_size=64, + update_queue_size=16 ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) @@ -78,8 +79,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.mpu = MPU( wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine, - update_queue_size=16 + push_engine=self.push_engine ) def getRespPort(self): @@ -88,9 +88,9 @@ def setRespPort(self, port): self.mpu.in_ports = port def getReqPort(self): - return self.mpu.out_ports + return self.push_engine.out_ports def setReqPort(self, port): - self.mpu.out_ports = port + self.push_engine.out_ports = port def 
set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.dram.range = vertex_range diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index aea76db86f..3547cb8817 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -39,8 +39,6 @@ class MPU(ClockedObject): in_ports = VectorResponsePort("Incoming Ports to receive updates from " "remote outside") - out_ports = VectorRequestPort("Outgoing ports to all MPUs") - wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " @@ -48,5 +46,3 @@ class MPU(ClockedObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") - update_queue_size = Param.Int("Maximum number of entries " - "for each update queue.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 7dba86aff2..5e0d2b3212 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,6 +34,8 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' + workload = Param.String("BFS", "Name of the workload.") + push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be @@ -45,4 +47,7 @@ class PushEngine(BaseMemoryEngine): max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " "done per cycle.") - workload = Param.String("BFS", "Name of the workload.") + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 4a80b22979..76d7d3114f 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -41,21 +41,12 @@ MPU::MPU(const Params& params): system(params.system), wlEngine(params.wl_engine), 
coalesceEngine(params.coalesce_engine), - pushEngine(params.push_engine), - updateQueueSize(params.update_queue_size), - nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) + pushEngine(params.push_engine) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); - - for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); - updateQueues.emplace_back(); - } - for (int i = 0; i < params.port_in_ports_connection_count; ++i) { inPorts.emplace_back( name() + ".in_ports" + std::to_string(i), this, i); @@ -67,8 +58,6 @@ MPU::getPort(const std::string& if_name, PortID idx) { if (if_name == "in_ports") { return inPorts[idx]; - } else if (if_name == "out_ports") { - return outPorts[idx]; } else { return ClockedObject::getPort(if_name, idx); } @@ -77,13 +66,9 @@ MPU::getPort(const std::string& if_name, PortID idx) void MPU::init() { - localAddrRange = getAddrRanges(); for (int i = 0; i < inPorts.size(); i++){ inPorts[i].sendRangeChange(); } - for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); - } } void @@ -144,46 +129,6 @@ MPU::RespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } -void -MPU::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(blockedPacket != nullptr, - "Should never try to send if blocked!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - } -} - -bool -MPU::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -MPU::ReqPort::recvReqRetry() -{ - panic_if(blockedPacket == nullptr, - "Received retry without a blockedPacket."); - - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); - if (blockedPacket == nullptr) { - owner->recvReqRetry(); - } -} - -void -MPU::recvReqRetry() -{ - if (!nextUpdatePushEvent.scheduled()) { - schedule(nextUpdatePushEvent, nextCycle()); - } -} - bool MPU::handleIncomingUpdate(PacketPtr pkt) { @@ -202,85 +147,6 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } -bool -MPU::enqueueUpdate(Update update) -{ - Addr dst_addr = update.dst; - bool found_locally = false; - bool accepted = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(dst_addr); - } - for (int i = 0; i < outPorts.size(); i++) { - AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - for (auto range : addr_range_list) { - if (range.contains(dst_addr)) { - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(MPU, "%s: Queue %d received an update.\n", __func__, i); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - accepted = true; - break; - } - } - } - } - - if (accepted && (!nextUpdatePushEvent.scheduled())) { - schedule(nextUpdatePushEvent, nextCycle()); - } - - return accepted; -} - -template PacketPtr -MPU::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) 1) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} - -void -MPU::processNextUpdatePushEvent() -{ - int next_time_send = 0; - - 
for (int i = 0; i < updateQueues.size(); i++) { - if (updateQueues[i].empty()) { - continue; - } - if (outPorts[i].blocked()) { - continue; - } - Update update; - Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[i].front(); - PacketPtr pkt = createUpdatePacket(update.dst, update.value); - outPorts[i].sendPacket(pkt); - DPRINTF(MPU, "%s: Sent update from addr: %lu to addr: %lu with value: " - "%d.\n", __func__, update.src, update.dst, update.value); - updateQueues[i].pop_front(); - if (updateQueues[i].size() > 0) { - next_time_send += 1; - } - } - - assert(!nextUpdatePushEvent.scheduled()); - if (next_time_send > 0) { - schedule(nextUpdatePushEvent, nextCycle()); - } -} - void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index ff17eada0e..4215f82d5b 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,27 +75,6 @@ class MPU : public ClockedObject virtual void recvRespRetry(); }; - class ReqPort : public RequestPort - { - private: - MPU* owner; - PacketPtr blockedPacket; - PortID _id; - - public: - ReqPort(const std::string& name, MPU* owner, PortID id) : - RequestPort(name, owner), - owner(owner), blockedPacket(nullptr), _id(id) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return (blockedPacket != nullptr); } - PortID id() { return _id; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - System* system; CenteralController* centeralController; @@ -103,20 +82,7 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - AddrRangeList localAddrRange; - - uint32_t updateQueueSize; - - std::unordered_map portAddrMap; - std::vector inPorts; - std::vector outPorts; - std::vector>> updateQueues; - - template PacketPtr createUpdatePacket(Addr addr, T value); - - EventFunctionWrapper nextUpdatePushEvent; - void processNextUpdatePushEvent(); public: PARAMS(MPU); @@ 
-133,7 +99,6 @@ class MPU : public ClockedObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); - bool enqueueUpdate(Update update); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return coalesceEngine->recvVertexPull(); } @@ -141,7 +106,6 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - void recvReqRetry(); void checkRetryReq(); void recvDoneSignal(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d533f1ea79..70c10cc358 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -44,11 +44,40 @@ PushEngine::PushEngine(const Params& params): onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), maxPropagatesPerCycle(params.max_propagates_per_cycle), workload(params.workload), + updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), stats(*this) -{} +{ + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back( + name() + ".out_ports" + std::to_string(i), this, i); + updateQueues.emplace_back(); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = 
outPorts[i].getAddrRanges(); + } +} void PushEngine::registerMPU(MPU* mpu) @@ -56,6 +85,46 @@ PushEngine::registerMPU(MPU* mpu) owner = mpu; } +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + owner->recvReqRetry(); + } +} + +void +PushEngine::recvReqRetry() +{ + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + bool PushEngine::vertexSpace() { @@ -255,7 +324,7 @@ PushEngine::processNextPropagateEvent() Update update(meta_edge.src, meta_edge.dst, update_value); edge_list.pop_front(); - if (owner->enqueueUpdate(update)) { + if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", __func__, meta_edge.to_string()); stats.numUpdates++; @@ -285,6 +354,87 @@ PushEngine::processNextPropagateEvent() } } +bool +PushEngine::enqueueUpdate(Update update) +{ + Addr dst_addr = update.dst; + bool found_locally = false; + bool accepted = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + for (auto range : addr_range_list) { + if (range.contains(dst_addr)) { + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: Queue %d received an update.\n", __func__, i); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, 
"%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + accepted = true; + break; + } + } + } + } + + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; +} + +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +PushEngine::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < updateQueues.size(); i++) { + if (updateQueues[i].empty()) { + continue; + } + if (outPorts[i].blocked()) { + continue; + } + Update update; + Tick entrance_tick; + std::tie(update, entrance_tick) = updateQueues[i].front(); + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outPorts[i].sendPacket(pkt); + DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " + "%d.\n", __func__, update.src, update.dst, update.value); + updateQueues[i].pop_front(); + DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + if (updateQueues[i].size() > 0) { + next_time_send += 1; + } + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index fed6909733..99fec33f2c 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -42,6 +42,27 @@ class MPU; class PushEngine : public BaseMemoryEngine { private: + class ReqPort : public 
RequestPort + { + private: + PushEngine* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, PushEngine* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + PortID id() { return _id; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + class EdgeReadInfoGen { private: Addr _start; @@ -95,6 +116,8 @@ class PushEngine : public BaseMemoryEngine bool _running; Tick lastIdleEntranceTick; + AddrRangeList localAddrRange; + int numPendingPulls; int edgePointerQueueSize; std::deque edgePointerQueue; @@ -108,6 +131,13 @@ class PushEngine : public BaseMemoryEngine std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); + int updateQueueSize; + std::vector>> updateQueues; + template PacketPtr createUpdatePacket(Addr addr, T value); + bool enqueueUpdate(Update update); + std::unordered_map portAddrMap; + std::vector outPorts; + bool vertexSpace(); bool workLeft(); @@ -120,6 +150,9 @@ class PushEngine : public BaseMemoryEngine EventFunctionWrapper nextPropagateEvent; void processNextPropagateEvent(); + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + struct PushStats : public statistics::Group { PushStats(PushEngine &push); @@ -147,6 +180,9 @@ class PushEngine : public BaseMemoryEngine public: PARAMS(PushEngine); PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; void registerMPU(MPU* mpu); virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } From d2e6f2e7119437f6762f03cf93f85bdb0beb67b5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 3 Oct 2022 10:01:32 -0700 Subject: [PATCH 179/247] Moving respPorts from MPU to WLEngine --- configs/accl/sega.py | 4 +- 
src/accl/graph/sega/MPU.py | 7 +-- src/accl/graph/sega/WLEngine.py | 6 ++- src/accl/graph/sega/mpu.cc | 79 ++------------------------- src/accl/graph/sega/mpu.hh | 39 ++------------ src/accl/graph/sega/wl_engine.cc | 93 ++++++++++++++++++++++++++++++-- src/accl/graph/sega/wl_engine.hh | 34 ++++++++++++ 7 files changed, 140 insertions(+), 122 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 21a041180f..c6c2171315 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -83,9 +83,9 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_ports + return self.wl_engine.in_ports def setRespPort(self, port): - self.mpu.in_ports = port + self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 3547cb8817..8d2453b01c 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,18 +27,15 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.SimObject import SimObject -class MPU(ClockedObject): +class MPU(SimObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" system = Param.System(Parent.any, "System this MPU is a part of") - in_ports = VectorResponsePort("Incoming Ports to receive updates from " - "remote outside") - wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index a44352ab9b..91325ab53f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,11 +34,15 @@ class WLEngine(BaseReduceEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote 
outside") + update_queue_size = Param.Int("Size of the queue WLEngine stores " "the incoming updates") + register_file_size = Param.Int("Number of internal registers the " "WLEngine has. It can service as " "many updates as this queueu has " - "entries at the same time.") # 4 is arbitrary + "entries at the same time.") workload = Param.String('BFS',"Name of the workload") \ No newline at end of file diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 76d7d3114f..c8d0f636f2 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,7 +29,6 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" -#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -37,7 +36,7 @@ namespace gem5 { MPU::MPU(const Params& params): - ClockedObject(params), + SimObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), @@ -46,30 +45,10 @@ MPU::MPU(const Params& params): wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); - - for (int i = 0; i < params.port_in_ports_connection_count; ++i) { - inPorts.emplace_back( - name() + ".in_ports" + std::to_string(i), this, i); - } -} - -Port& -MPU::getPort(const std::string& if_name, PortID idx) -{ - if (if_name == "in_ports") { - return inPorts[idx]; - } else { - return ClockedObject::getPort(if_name, idx); - } } -void -MPU::init() -{ - for (int i = 0; i < inPorts.size(); i++){ - inPorts[i].sendRangeChange(); - } -} +MPU::~MPU() +{} void MPU::registerCenteralController(CenteralController* centeral_controller) @@ -77,58 +56,6 @@ MPU::registerCenteralController(CenteralController* centeral_controller) centeralController = centeral_controller; } -AddrRangeList -MPU::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -void -MPU::RespPort::checkRetryReq() -{ - if (needSendRetryReq) { - sendRetryReq(); - needSendRetryReq = false; - } -} 
- -void -MPU::checkRetryReq() -{ - for (int i = 0; i < inPorts.size(); ++i) { - inPorts[i].checkRetryReq(); - } -} - -bool -MPU::RespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleIncomingUpdate(pkt)) { - needSendRetryReq = true; - return false; - } - - return true; -} - -Tick -MPU::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -MPU::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -MPU::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - bool MPU::handleIncomingUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 4215f82d5b..a1e5055226 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -38,8 +38,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/packet.hh" -#include "mem/port.hh" -#include "sim/clocked_object.hh" +#include "sim/sim_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -48,33 +47,9 @@ namespace gem5 class CenteralController; -class MPU : public ClockedObject +class MPU : public SimObject { private: - class RespPort : public ResponsePort - { - private: - MPU* owner; - bool needSendRetryReq; - PortID _id; - - public: - RespPort(const std::string& name, MPU* owner, PortID id): - ResponsePort(name, owner), - owner(owner), needSendRetryReq(false), _id(id) - {} - virtual AddrRangeList getAddrRanges() const; - - PortID id() { return _id; } - void checkRetryReq(); - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - System* system; CenteralController* centeralController; @@ -82,20 +57,16 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - std::vector inPorts; - public: PARAMS(MPU); MPU(const Params& params); - Port& getPort(const std::string& 
if_name, - PortID idx = InvalidPortID) override; - virtual void init() override; + ~MPU(); void registerCenteralController(CenteralController* centeral_controller); AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } - bool handleIncomingUpdate(PacketPtr pkt); + void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); @@ -106,8 +77,6 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - void checkRetryReq(); - void recvDoneSignal(); bool done(); }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 0267bd46b6..9a548a3255 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -45,7 +45,30 @@ WLEngine::WLEngine(const WLEngineParams& params): nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) -{} +{ + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } +} + +Port& +WLEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_ports") { + return inPorts[idx]; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +WLEngine::init() +{ + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } +} void WLEngine::registerMPU(MPU* mpu) @@ -53,6 +76,70 @@ WLEngine::registerMPU(MPU* mpu) owner = mpu; } +AddrRangeList +WLEngine::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void 
+WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + sendRetryReq(); + needSendRetryReq = false; + } +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); ++i) { + inPorts[i].checkRetryReq(); + } +} + bool WLEngine::done() { @@ -144,7 +231,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - owner->checkRetryReq(); + checkRetryReq(); vertexReadTime[update_addr] = curTick(); } } else { @@ -173,7 +260,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - owner->checkRetryReq(); + checkRetryReq(); } if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f888979be9..5f08678d26 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,8 +45,34 @@ class MPU; class WLEngine : public BaseReduceEngine { private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + bool needSendRetryReq; + PortID _id; + + public: + RespPort(const std::string& name, WLEngine* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) + {} + virtual AddrRangeList getAddrRanges() const; + + PortID id() { return _id; } + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + MPU* owner; + std::vector inPorts; + int updateQueueSize; std::deque> updateQueue; @@ -86,11 +112,19 @@ class WLEngine : public BaseReduceEngine public: PARAMS(WLEngine); WLEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; void registerMPU(MPU* mpu); + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); + void checkRetryReq(); + bool done(); }; From 07cfd5fbb3381f8be86224e491c0eb0dc5d9da97 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 3 Oct 2022 12:58:25 -0700 Subject: [PATCH 180/247] Updating dprintfs. 
--- src/accl/graph/sega/push_engine.cc | 50 ++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70c10cc358..9039eb408d 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -109,10 +109,12 @@ PushEngine::ReqPort::recvReqRetry() panic_if(blockedPacket == nullptr, "Received retry without a blockedPacket."); + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); if (blockedPacket == nullptr) { + DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__); owner->recvReqRetry(); } } @@ -120,6 +122,7 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::recvReqRetry() { + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); if (!nextUpdatePushEvent.scheduled()) { schedule(nextUpdatePushEvent, nextCycle()); } @@ -325,7 +328,7 @@ PushEngine::processNextPropagateEvent() edge_list.pop_front(); if (enqueueUpdate(update)) { - DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", + DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); stats.numUpdates++; stats.edgeQueueLatency.sample( @@ -363,14 +366,17 @@ PushEngine::enqueueUpdate(Update update) for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } + DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; for (auto range : addr_range_list) { if (range.contains(dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), 
outPorts[i].id()); if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(PushEngine, "%s: Queue %d received an update.\n", __func__, i); + DPRINTF(PushEngine, "%s: There is a free entry available in queue %d.\n", __func__, outPorts[i].id()); updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back of queue for port %d is. Size of queue for port %d is %d.\n", __func__, outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); accepted = true; break; } @@ -408,23 +414,47 @@ PushEngine::processNextUpdatePushEvent() { int next_time_send = 0; - for (int i = 0; i < updateQueues.size(); i++) { - if (updateQueues[i].empty()) { + // for (int i = 0; i < updateQueues.size(); i++) { + // if (updateQueues[i].empty()) { + // continue; + // } + // if (outPorts[i].blocked()) { + // continue; + // } + // Update update; + // Tick entrance_tick; + // std::tie(update, entrance_tick) = updateQueues[i].front(); + // PacketPtr pkt = createUpdatePacket(update.dst, update.value); + // outPorts[i].sendPacket(pkt); + // DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " + // "%d.\n", __func__, update.src, update.dst, update.value); + // updateQueues[i].pop_front(); + // DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + // if (updateQueues[i].size() > 0) { + // next_time_send += 1; + // } + // } + + for (int i = 0; i < outPorts.size(); i++) { + if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, outPorts[i].id()); continue; } - if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, outPorts[i].id()); + if (updateQueues[outPorts[i].id()].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for port %d is empty.\n", __func__, 
outPorts[i].id()); continue; } + DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[i].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " - "%d.\n", __func__, update.src, update.dst, update.value); - updateQueues[i].pop_front(); + DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d.\n", __func__, outPorts[i].id(), outPorts[i].id()); + updateQueues[outPorts[i].id()].pop_front(); DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); - if (updateQueues[i].size() > 0) { + if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } } From bab798ddaa2384e934ebc1775ac5755f83affdc8 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 4 Oct 2022 12:49:29 -0700 Subject: [PATCH 181/247] Fixing the problems with retry --- configs/accl/sega.py | 6 +++--- src/accl/graph/sega/push_engine.cc | 8 ++++---- src/accl/graph/sega/push_engine.hh | 3 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c6c2171315..6b198c5f4a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,8 +48,8 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 + update_queue_size=2, + register_file_size=2 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16 + update_queue_size=2 ) self.vertex_mem_ctrl = 
MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9039eb408d..238b8a89fb 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -54,7 +54,6 @@ PushEngine::PushEngine(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( name() + ".out_ports" + std::to_string(i), this, i); - updateQueues.emplace_back(); } } @@ -93,6 +92,7 @@ PushEngine::ReqPort::sendPacket(PacketPtr pkt) // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { + DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__); blockedPacket = pkt; } } @@ -386,7 +386,7 @@ PushEngine::enqueueUpdate(Update update) if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); - } + } return accepted; } @@ -448,10 +448,10 @@ PushEngine::processNextUpdatePushEvent() DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[i].front(); + std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d.\n", __func__, outPorts[i].id(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d the queue size is %d.\n", __func__, update.to_string(), outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); updateQueues[outPorts[i].id()].pop_front(); DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); if (updateQueues[outPorts[i].id()].size() > 0) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 99fec33f2c..4e0cdbc526 100644 --- 
a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -132,10 +132,11 @@ class PushEngine : public BaseMemoryEngine uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; - std::vector>> updateQueues; + // std::vector>> updateQueues; template PacketPtr createUpdatePacket(Addr addr, T value); bool enqueueUpdate(Update update); std::unordered_map portAddrMap; + std::unordered_map>> updateQueues; std::vector outPorts; bool vertexSpace(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9a548a3255..116cdf3f77 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -98,8 +98,8 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - sendRetryReq(); needSendRetryReq = false; + sendRetryReq(); } } From 6140135bdc790a77b13d8026292874c3d91154fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Oct 2022 14:10:57 -0700 Subject: [PATCH 182/247] Fixing done, code style and conifg. Adding a stat. 
--- configs/accl/sega-simple.py | 68 ++++++------- configs/accl/sega-single-simple.py | 151 ---------------------------- configs/accl/sega-single.py | 155 ----------------------------- configs/accl/sega.py | 14 +-- src/accl/graph/sega/mpu.cc | 3 - src/accl/graph/sega/mpu.hh | 1 - src/accl/graph/sega/push_engine.cc | 97 ++++++++++-------- src/accl/graph/sega/push_engine.hh | 4 +- 8 files changed, 90 insertions(+), 403 deletions(-) delete mode 100644 configs/accl/sega-single-simple.py delete mode 100644 configs/accl/sega-single.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index fffc273ee1..54a90281bf 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -48,20 +48,21 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 + update_queue_size=128, + register_file_size=64 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64 + resp_queue_size=64, + update_queue_size=16, ) self.vertex_mem_ctrl = SimpleMemory( @@ -88,14 +89,14 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_port + return self.wl_engine.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.wl_engine.in_ports = port def getReqPort(self): - return self.mpu.out_port + return self.push_engine.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.push_engine.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range @@ -103,54 +104,39 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.image_file = edge_image class 
SEGA(System): - def __init__( - self, - num_mpus, - cache_size, - graph_path, - first_addr, - first_value - ): + def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '4GHz' + self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar( - frontend_latency=1, - forward_latency=1, - response_latency=1, - width=64 - ) - - self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, - image_file=f"{graph_path}/vertices" - ) - - self.ctrl.req_port = self.interconnect.cpu_side_ports + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpt.setReqPort(self.interconnect.cpu_side_ports) - gpt.setRespPort(self.interconnect.mem_side_ports) gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -167,11 +153,13 @@ def get_inputs(): if __name__ == "__m5_main__": num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) m5.instantiate() + system.create_initial_bfs_update(init_addr, init_value) + exit_event 
= m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py deleted file mode 100644 index eacb16d3d1..0000000000 --- a/configs/accl/sega-single-simple.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64 - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.mpu.in_ports - def setRespPort(self, port): - self.mpu.in_ports = port - - def setReqPort(self, port): - self.mpu.out_ports = port - def getReqPort(self): - return self.mpu.out_ports - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - - def set_vertex_image(self, vertex_image): - self.vertex_mem_ctrl.image_file = vertex_image - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image - -class SEGA(System): - def __init__(self, cache_size, 
graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - gpts = [GPT("8GiB", cache_size)] - gpts[0].set_vertex_range(AddrRange("4GiB")) - gpts[0].set_edge_image(f"{graph_path}/edgelist_0") - gpts[0].setReqPort(gpts[0].getRespPort()) - self.gpts = gpts - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.cache_size, args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py deleted file mode 100644 index e4f7942f42..0000000000 --- a/configs/accl/sega-single.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64 - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GiB/s" - ) - - self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.mpu.in_port - def setRespPort(self, port): - self.mpu.in_port = port - - def getReqPort(self): - return self.mpu.out_ports - def setReqPort(self, port): - self.mpu.out_ports = port - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image - -class SEGA(System): - def __init__(self, cache_size, graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - 
self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - plain_vertex_range = AddrRange("4GiB") - self._vertex_ranges = interleave_addresses( - plain_vertex_range, - 1, - 32 - ) - - gpts = [GPT("8GiB", cache_size)] - gpts[0].set_vertex_ranges(self._vertex_ranges[0]) - gpts[0].set_edge_image(f"{graph_path}/edgelist_0") - gpts[0].setReqPort(gpts[0].getRespPort()) - self.gpts = gpts - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.cache_size, args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 6b198c5f4a..fab414f2c5 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,21 +48,21 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=2, - register_file_size=2 + update_queue_size=128, + register_file_size=64 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 + num_mshr_entry=64, 
+ num_tgts_per_mshr=64, + max_resp_per_cycle=8 ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=2 + update_queue_size=16 ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) @@ -101,7 +101,7 @@ class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '1GHz' + self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index c8d0f636f2..44054d1efb 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -47,9 +47,6 @@ MPU::MPU(const Params& params): pushEngine->registerMPU(this); } -MPU::~MPU() -{} - void MPU::registerCenteralController(CenteralController* centeral_controller) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index a1e5055226..229bd28950 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -60,7 +60,6 @@ class MPU : public SimObject public: PARAMS(MPU); MPU(const Params& params); - ~MPU(); void registerCenteralController(CenteralController* centeral_controller); AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 238b8a89fb..5835b61fc6 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -53,7 +53,7 @@ PushEngine::PushEngine(const Params& params): { for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); + name() + ".out_ports" + std::to_string(i), this, i); } } @@ -144,9 +144,12 @@ PushEngine::workLeft() bool PushEngine::done() { - return edgeQueue.empty() && - (onTheFlyMemReqs == 0) && - 
edgePointerQueue.empty(); + bool empty_update_queues = true; + for (int i = 0; i < outPorts.size(); i++) { + empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + } + return empty_update_queues && edgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } @@ -357,6 +360,16 @@ PushEngine::processNextPropagateEvent() } } +bool +contains(AddrRangeList range_list, Addr addr) +{ + bool found = false; + for (auto range: range_list) { + found |= range.contains(addr); + } + return found; +} + bool PushEngine::enqueueUpdate(Update update) { @@ -369,24 +382,32 @@ PushEngine::enqueueUpdate(Update update) DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - for (auto range : addr_range_list) { - if (range.contains(dst_addr)) { - DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", __func__, update.to_string(), outPorts[i].id()); - DPRINTF(PushEngine, "%s: There are %d updates already in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(PushEngine, "%s: There is a free entry available in queue %d.\n", __func__, outPorts[i].id()); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Emplaced the update at the back of queue for port %d is. 
Size of queue for port %d is %d.\n", __func__, outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); - accepted = true; - break; - } + if (contains(addr_range_list, dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", + __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already " + "in queue for port %d.\n", __func__, + updateQueues[outPorts[i].id()].size(), + outPorts[i].id()); + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue %d.\n", __func__, outPorts[i].id()); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back " + "of queue for port %d is. Size of queue " + "for port %d is %d.\n", __func__, + outPorts[i].id(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); + accepted = true; + stats.updateQueueLength.sample( + updateQueues[outPorts[i].id()].size()); } } } if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); - } + } return accepted; } @@ -414,46 +435,31 @@ PushEngine::processNextUpdatePushEvent() { int next_time_send = 0; - // for (int i = 0; i < updateQueues.size(); i++) { - // if (updateQueues[i].empty()) { - // continue; - // } - // if (outPorts[i].blocked()) { - // continue; - // } - // Update update; - // Tick entrance_tick; - // std::tie(update, entrance_tick) = updateQueues[i].front(); - // PacketPtr pkt = createUpdatePacket(update.dst, update.value); - // outPorts[i].sendPacket(pkt); - // DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " - // "%d.\n", __func__, update.src, update.dst, update.value); - // updateQueues[i].pop_front(); - // DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); - // if (updateQueues[i].size() > 0) { - // next_time_send += 1; - // } - // } - 
for (int i = 0; i < outPorts.size(); i++) { if (outPorts[i].blocked()) { - DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d blocked.\n", + __func__, outPorts[i].id()); continue; } - DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d available.\n", + __func__, outPorts[i].id()); if (updateQueues[outPorts[i].id()].empty()) { - DPRINTF(PushEngine, "%s: Respective queue for port %d is empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d is empty.\n", __func__, outPorts[i].id()); continue; } - DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d the queue size is %d.\n", __func__, update.to_string(), outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); + DPRINTF(PushEngine, "%s: Sent update: %s to port %d. 
" + "Respective queue size is %d.\n", __func__, + update.to_string(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); updateQueues[outPorts[i].id()].pop_front(); - DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } @@ -480,7 +486,9 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), - "Histogram of the latency of the edgeQueue.") + "Histogram of the latency of the edgeQueue."), + ADD_STAT(updateQueueLength, statistics::units::Count::get(), + "Histogram of the length of updateQueues.") { } @@ -493,6 +501,7 @@ PushEngine::PushStats::regStats() edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); + updateQueueLength.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4e0cdbc526..fbe527bcb6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -51,7 +51,7 @@ class PushEngine : public BaseMemoryEngine public: ReqPort(const std::string& name, PushEngine* owner, PortID id) : - RequestPort(name, owner), + RequestPort(name, owner), owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); @@ -132,7 +132,6 @@ class PushEngine : public BaseMemoryEngine uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; - // std::vector>> updateQueues; template PacketPtr createUpdatePacket(Addr addr, T value); bool enqueueUpdate(Update update); std::unordered_map portAddrMap; @@ -170,6 +169,7 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; + statistics::Histogram updateQueueLength; }; PushStats stats; From 
4b555f682145b9f7dbd306ac5ff7ce47a150dc03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 6 Oct 2022 15:35:54 -0700 Subject: [PATCH 183/247] Back indent. --- configs/accl/sega-simple.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index 54a90281bf..93267f0f24 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -31,18 +31,18 @@ from m5.objects import * def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): From fe68447f9d5b106c6802e2cd7e5e47718c0dd83c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 7 Oct 2022 10:27:22 -0700 Subject: [PATCH 184/247] Fixed HBM range issue. 
--- configs/accl/sega-hbm.py | 163 ++++++++++++++++++++++ src/accl/graph/sega/base_memory_engine.cc | 11 +- src/accl/graph/sega/coalesce_engine.cc | 27 ---- src/base/addr_range.hh | 44 +++--- src/mem/HBMCtrl.py | 2 + src/mem/hbm_ctrl.cc | 10 +- src/mem/hbm_ctrl.hh | 3 +- 7 files changed, 202 insertions(+), 58 deletions(-) create mode 100644 configs/accl/sega-hbm.py diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py new file mode 100644 index 0000000000..da7d79d7fe --- /dev/null +++ b/configs/accl/sega-hbm.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=128, + register_file_size=64 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64, + update_queue_size=16 + ) + + self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64()) + + self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return 
self.wl_engine.in_ports + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + def setReqPort(self, port): + self.push_engine.out_ports = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.dram.image_file = edge_image + +class SEGA(System): + def __init__(self, num_mpus, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + 2*num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("2GiB", cache_size) + gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) + gpt.set_vertex_pch_bit(8) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if 
__name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index d9864664b1..9f704f71e9 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -60,13 +60,10 @@ BaseMemoryEngine::init() { AddrRangeList memory_ranges = memPort.getAddrRanges(); - if (memory_ranges.size() == 2) { - peerMemoryRange = merge(memory_ranges.front(), memory_ranges.back()); - } else if (memory_ranges.size() == 1) { - peerMemoryRange = memory_ranges.front(); - } else { - panic("Received an unacceptable number of ranges from memory."); - } + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " "%s. 
The range is %s interleaved.\n", __func__, peerMemoryRange.to_string(), diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0a4a041176..f4cd6a950d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -127,15 +127,6 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - // bool found = false; - // Addr trimmed_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(addr)) { - // trimmed_addr = range.removeIntlvBits(addr); - // found = true; - // } - // } - // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } @@ -145,15 +136,6 @@ int CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - // bool found = false; - // Addr trimmed_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(addr)) { - // trimmed_addr = range.removeIntlvBits(addr); - // found = true; - // } - // } - // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); @@ -165,16 +147,7 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - // bool found = false; Addr trimmed_addr = index * sizeof(WorkListItem); - // Addr upgraded_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(trimmed_addr)) { - // upgraded_addr = range.addIntlvBits(trimmed_addr); - // found = true; - // } - // } - // assert(found); return peerMemoryRange.addIntlvBits(trimmed_addr); } diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index a4bf581224..339fdb6c55 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -735,33 +735,37 @@ class AddrRange } friend AddrRange - 
merge(const AddrRange& left, const AddrRange& right) + mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit) { assert(left.interleaved()); assert(right.interleaved()); assert(left.mergesWith(right)); - int bits_org = left.masks.size(); - int bits_new = bits_org - 1; - - int left_match = left.intlvMatch; - int right_match = right.intlvMatch; - assert(std::abs(left_match - right_match) == (1 << bits_new)); - - Addr last_mask = left.masks[left.masks.size() - 1]; - int xor_high_bit_org = 0; - int xor_high_bit_new = 0; - if (!isPowerOf2(last_mask)) { - xor_high_bit_org = ceilLog2(last_mask); - xor_high_bit_new = xor_high_bit_org - 2; + uint8_t old_left_match = left.intlvMatch; + uint8_t new_left_match = 0; + uint8_t old_right_match = right.intlvMatch; + uint8_t new_right_match = 0; + int new_bits = left.masks.size() - 1; + + // assumption: masks is sorted in ascending order + std::vector new_masks; + for (auto mask: left.masks) { + uint64_t lsb_mask = (mask ^ (mask - 1)) + 1; + if ((lsb_mask >> 1) != (1 << pch_bit)) { + new_masks.push_back(mask); + new_left_match |= ((old_left_match & 1) << new_bits); + new_left_match >>= 1; + new_right_match |= ((old_right_match & 1) << new_bits); + new_right_match >>= 1; + } + old_left_match >>= 1; + old_right_match >>= 1; } - int intlv_high_bit_org = - ceilLog2(last_mask ^ (1 << xor_high_bit_org)); - int intlv_high_bit_new = intlv_high_bit_org - 2; + panic_if(new_left_match != new_right_match, + "The two ranges can not be a pseudo channel pair " + "given the pseudochannel bit position of params.pch_bit."); - int match = std::min(left_match, right_match); - return AddrRange(left._start, left._end, intlv_high_bit_new, - xor_high_bit_new, bits_new, match); + return AddrRange(left._start, left._end, new_masks, new_left_match); } }; diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index 0c7c1ea919..f7355d4b67 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -42,6 +42,8 @@ class HBMCtrl(MemCtrl): # HBMCtrl 
has been tested with two HBM_2000_4H_1x64 interfaces dram_2 = Param.DRAMInterface("DRAM memory interface") + pch_bit = Param.Int("Position of PseudoChannel bit in addresses.") + # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 99618c4b5f..efd46bbd54 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,6 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), + pchBit(p.pch_bit), retryRdReqPC1(false), retryWrReqPC1(false), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, @@ -233,7 +234,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + if (bits(pkt->getAddr(), pchBit) == 0) { is_pc0 = true; } else { is_pc0 = false; @@ -492,8 +493,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index c9045f0ae7..f204b8346f 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,7 +72,8 @@ class HBMCtrl : public MemCtrl } private: - + // Position of the pseudochannel bit in addresses. + int pchBit; /** * Remember if we have to retry a request for second pseudo channel. 
*/ From d30ddb5df9c64082e10ff101b4064e41bbf41029 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 7 Oct 2022 11:49:25 -0700 Subject: [PATCH 185/247] Refactoring reading edges from memory --- src/accl/graph/sega/push_engine.cc | 41 +++++++++++++----------------- src/accl/graph/sega/push_engine.hh | 10 ++++++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5835b61fc6..7265cec1a4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -148,7 +148,7 @@ PushEngine::done() for (int i = 0; i < outPorts.size(); i++) { empty_update_queues &= updateQueues[outPorts[i].id()].empty(); } - return empty_update_queues && edgeQueue.empty() && + return empty_update_queues && metaEdgeQueue.empty() && (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } @@ -230,13 +230,13 @@ PushEngine::processNextMemoryReadEvent() nextMemoryReadEvent.sleep(); return; } + Addr aligned_addr, offset; + int num_edges; - if (edgeQueue.size() < (edgeQueueSize - onTheFlyMemReqs)) { - Addr aligned_addr, offset; - int num_edges; - - EdgeReadInfoGen &curr_info = edgePointerQueue.front(); - std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + EdgeReadInfoGen& curr_info = edgePointerQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + { DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. 
aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); @@ -246,8 +246,9 @@ PushEngine::processNextMemoryReadEvent() reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); - onTheFlyMemReqs++; + onTheFlyMemReqs += num_edges; + curr_info.iterate(); if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); stats.edgePointerQueueLatency.sample( @@ -290,19 +291,16 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque> edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; MetaEdge meta_edge( push_info.src, edge_dst, edge_weight, push_info.value); - edges.emplace_back(meta_edge, curTick()); + metaEdgeQueue.emplace_back(meta_edge, curTick()); } - assert(!edges.empty()); - edgeQueue.push_back(edges); - onTheFlyMemReqs--; + onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); delete pkt_data; delete pkt; @@ -318,17 +316,16 @@ PushEngine::processNextPropagateEvent() { int num_propagates = 0; while(true) { - std::deque>& edge_list = edgeQueue.front(); MetaEdge meta_edge; Tick entrance_tick; - std::tie(meta_edge, entrance_tick) = edge_list.front(); + std::tie(meta_edge, entrance_tick) = metaEdgeQueue.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); - edge_list.pop_front(); + metaEdgeQueue.pop_front(); if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", @@ -337,14 +334,10 @@ PushEngine::processNextPropagateEvent() stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { - 
edge_list.emplace_back(meta_edge, entrance_tick); - } - - if (edge_list.empty()) { - edgeQueue.pop_front(); + metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } - if (edgeQueue.empty()) { + if (metaEdgeQueue.empty()) { break; } @@ -355,7 +348,7 @@ PushEngine::processNextPropagateEvent() } assert(!nextPropagateEvent.scheduled()); - if (!edgeQueue.empty()) { + if (!metaEdgeQueue.empty()) { schedule(nextPropagateEvent, nextCycle()); } } @@ -486,7 +479,7 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), - "Histogram of the latency of the edgeQueue."), + "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues.") { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index fbe527bcb6..cc087aff11 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -93,11 +93,17 @@ class PushEngine : public BaseMemoryEngine } else { num_items = (_end - _start) / _step; } - _start = aligned_addr + _atom; return std::make_tuple(aligned_addr, offset, num_items); } + void iterate() + { + panic_if(done(), "Should not call iterate when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + _start = aligned_addr + _atom; + } + bool done() { return (_start >= _end); } Addr src() { return _src; } @@ -126,7 +132,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; int maxPropagatesPerCycle; - std::deque>> edgeQueue; + std::deque> metaEdgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); From 7a6ab86032f9480e0c8d733a3968aa34f8d0eea2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 7 Oct 2022 13:33:25 -0700 Subject: [PATCH 186/247] Added statistics to calculate 
number of propagates sent --- src/accl/graph/sega/push_engine.cc | 10 +++++++--- src/accl/graph/sega/push_engine.hh | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 7265cec1a4..4b3277d3e1 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -336,17 +336,18 @@ PushEngine::processNextPropagateEvent() } else { metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } + num_propagates++; if (metaEdgeQueue.empty()) { break; } - - num_propagates++; if (num_propagates >= maxPropagatesPerCycle) { break; } } + stats.numPropagates.sample(num_propagates); + assert(!nextPropagateEvent.scheduled()); if (!metaEdgeQueue.empty()) { schedule(nextPropagateEvent, nextCycle()); @@ -481,7 +482,9 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), - "Histogram of the length of updateQueues.") + "Histogram of the length of updateQueues."), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Histogram of number of propagates sent.") { } @@ -495,6 +498,7 @@ PushEngine::PushStats::regStats() edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); updateQueueLength.init(64); + numPropagates.init(push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index cc087aff11..c078391420 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -176,6 +176,7 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; statistics::Histogram updateQueueLength; + statistics::Histogram numPropagates; }; PushStats stats; From 0bd83b6cc1c661fa484ab5d0a527d0a3d1e93722 Mon Sep 17 00:00:00 2001 
From: Marjan Fariborz Date: Sat, 8 Oct 2022 16:25:41 -0700 Subject: [PATCH 187/247] Adding coalescing to pushEngine --- src/accl/graph/sega/push_engine.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4b3277d3e1..79e5344395 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -368,6 +368,7 @@ bool PushEngine::enqueueUpdate(Update update) { Addr dst_addr = update.dst; + bool fount_coalescing = false; bool found_locally = false; bool accepted = false; for (auto range : localAddrRange) { @@ -383,7 +384,26 @@ PushEngine::enqueueUpdate(Update update) "in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + for (auto itr = updateQueues[outPorts[i].id()].begin(); + itr != updateQueues[outPorts[i].id()].end(); + itr++){ + std::tuple curr_update = *itr; + if (std::get<0>(curr_update).dst == update.dst){ + uint32_t value = + std::min(std::get<0>(curr_update).value, update.value); + DPRINTF(PushEngine, "%s: found a coalescing opportunity " + "for destination %d new value: %d by comparing %d " + "and %d. 
\n", __func__, update.dst, value, + std::get<0>(curr_update).value, update.value); + fount_coalescing = true; + update.value = value; + updateQueues[outPorts[i].id()].erase(itr); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + break; + } + } + if ((fount_coalescing == false) && + (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue %d.\n", __func__, outPorts[i].id()); updateQueues[outPorts[i].id()].emplace_back(update, curTick()); @@ -398,6 +418,7 @@ PushEngine::enqueueUpdate(Update update) } } } + fount_coalescing = false; if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); From 9f052dcf27a64d21582f48f41eb032bb1fe48464 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 8 Oct 2022 19:49:58 -0700 Subject: [PATCH 188/247] Adding function to print final answer. --- configs/accl/sega-hbm.py | 18 +++-- configs/accl/sega-simple.py | 2 +- configs/accl/sega.py | 2 +- src/accl/graph/sega/CenteralController.py | 5 +- src/accl/graph/sega/centeral_controller.cc | 44 +++++++++++- src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/push_engine.cc | 80 ++++++++++++---------- src/accl/graph/sega/push_engine.hh | 9 ++- src/base/addr_range.hh | 10 +++ 9 files changed, 125 insertions(+), 48 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index da7d79d7fe..70aac6c2cb 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -61,8 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=16 + resp_queue_size=512, + update_queue_size=32 ) self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), @@ -136,6 +136,9 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): 
self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def print_answer(self): + self.ctrl.printAnswerToHostSimout() + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -143,14 +146,19 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument("--verify", type=bool, help="Print final answer") args = argparser.parse_args() + verify = False + if not args.verify is None: + verify = args.verify + return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value + args.graph, args.init_addr, args.init_value, verify if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) @@ -161,3 +169,5 @@ def get_inputs(): exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") + if verify: + system.print_answer() diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index 93267f0f24..7ec19c92ae 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16, + update_queue_size=32, ) self.vertex_mem_ctrl = SimpleMemory( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index fab414f2c5..c50c525297 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16 + update_queue_size=32 ) self.vertex_mem_ctrl = 
MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 0721ff977c..2ba53c231f 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -41,4 +41,7 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") - cxx_exports = [PyBindMethod("createInitialBFSUpdate")] + cxx_exports = [ + PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("printAnswerToHostSimout") + ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 68b88e9e77..7c89c1edea 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,9 @@ #include "accl/graph/sega/centeral_controller.hh" +#include + +#include "base/cprintf.hh" #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" @@ -62,7 +65,7 @@ CenteralController::initState() loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); - Addr maxVertexAddr = image.maxAddr(); + maxVertexAddr = image.maxAddr(); PortProxy proxy( [this](PacketPtr pkt) { @@ -97,6 +100,21 @@ CenteralController::startup() } } +PacketPtr +CenteralController::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 0) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + template PacketPtr CenteralController::createUpdatePacket(Addr addr, T value) { @@ -134,4 +152,28 @@ CenteralController::recvDoneSignal() } } +void +CenteralController::printAnswerToHostSimout() +{ + int num_items = system->cacheLineSize() / sizeof(WorkListItem); + 
WorkListItem items[num_items]; + for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) + { + PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + if (contains(range_list, addr)) { + mpu->recvFunctional(pkt); + } + } + pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + for (int i = 0; i < num_items; i++) { + std::string print = csprintf("WorklistItem[%lu][%d]: %s.", + addr, i, items[i].to_string()); + + std::cout << print << std::endl; + } + } +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 4a4e9c7cb1..d006851e3b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -53,6 +53,7 @@ class CenteralController : public ClockedObject std::vector mpuVector; std::unordered_map addrRangeListMap; + PacketPtr createReadPacket(Addr addr, unsigned int size); template PacketPtr createUpdatePacket(Addr addr, T value); public: @@ -64,6 +65,8 @@ class CenteralController : public ClockedObject void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void recvDoneSignal(); + + void printAnswerToHostSimout(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 79e5344395..d5fb002f82 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,7 +43,6 @@ PushEngine::PushEngine(const Params& params): numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), maxPropagatesPerCycle(params.max_propagates_per_cycle), - workload(params.workload), updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), @@ -152,10 +151,23 @@ PushEngine::done() (onTheFlyMemReqs == 
0) && edgePointerQueue.empty(); } +uint32_t +PushEngine::reduce(uint32_t update, uint32_t value) +{ + std::string workload = params().workload; + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} uint32_t PushEngine::propagate(uint32_t value, uint32_t weight) { + std::string workload = params().workload; uint32_t update; if (workload == "BFS") { update = value + 1; @@ -235,7 +247,7 @@ PushEngine::processNextMemoryReadEvent() EdgeReadInfoGen& curr_info = edgePointerQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " @@ -299,6 +311,8 @@ PushEngine::handleMemResp(PacketPtr pkt) push_info.src, edge_dst, edge_weight, push_info.value); metaEdgeQueue.emplace_back(meta_edge, curTick()); } + stats.numWastefulEdgesRead += + (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); @@ -330,7 +344,7 @@ PushEngine::processNextPropagateEvent() if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); - stats.numUpdates++; + stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { @@ -346,7 +360,7 @@ PushEngine::processNextPropagateEvent() } } - stats.numPropagates.sample(num_propagates); + stats.numPropagatesHist.sample(num_propagates); assert(!nextPropagateEvent.scheduled()); if (!metaEdgeQueue.empty()) { @@ -354,21 +368,11 @@ PushEngine::processNextPropagateEvent() } } -bool -contains(AddrRangeList range_list, Addr addr) -{ - bool found = false; - for (auto range: 
range_list) { - found |= range.contains(addr); - } - return found; -} - bool PushEngine::enqueueUpdate(Update update) { Addr dst_addr = update.dst; - bool fount_coalescing = false; + bool found_coalescing = false; bool found_locally = false; bool accepted = false; for (auto range : localAddrRange) { @@ -384,25 +388,21 @@ PushEngine::enqueueUpdate(Update update) "in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - for (auto itr = updateQueues[outPorts[i].id()].begin(); - itr != updateQueues[outPorts[i].id()].end(); - itr++){ - std::tuple curr_update = *itr; - if (std::get<0>(curr_update).dst == update.dst){ - uint32_t value = - std::min(std::get<0>(curr_update).value, update.value); + for (auto& entry: updateQueues[outPorts[i].id()]) { + Update& curr_update = std::get<0>(entry); + if (curr_update.dst == update.dst) { + uint32_t old_value = curr_update.value; + curr_update.value = reduce(old_value, update.value); DPRINTF(PushEngine, "%s: found a coalescing opportunity " - "for destination %d new value: %d by comparing %d " - "and %d. \n", __func__, update.dst, value, - std::get<0>(curr_update).value, update.value); - fount_coalescing = true; - update.value = value; - updateQueues[outPorts[i].id()].erase(itr); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - break; + "for destination %d with new value: %d by " + "coalescing %d and %d. 
\n", __func__, update.dst, + curr_update.value, old_value, update.value); + found_coalescing = true; + accepted = true; + stats.updateQueueCoalescions++; } } - if ((fount_coalescing == false) && + if ((found_coalescing == false) && (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue %d.\n", __func__, outPorts[i].id()); @@ -418,7 +418,6 @@ PushEngine::enqueueUpdate(Update update) } } } - fount_coalescing = false; if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); @@ -478,6 +477,7 @@ PushEngine::processNextUpdatePushEvent() if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } + stats.numUpdates++; } assert(!nextUpdatePushEvent.scheduled()); @@ -489,12 +489,18 @@ PushEngine::processNextUpdatePushEvent() PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), - ADD_STAT(numUpdates, statistics::units::Count::get(), - "Number of sent updates."), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Number of propagate operations done."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), ADD_STAT(numIdleCycles, statistics::units::Count::get(), "Number of cycles PushEngine has been idle."), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of updates sent to the network."), + ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(), + "Number of wasteful edges read from edge memory."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second."), @@ -504,7 +510,7 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), - 
ADD_STAT(numPropagates, statistics::units::Count::get(), + ADD_STAT(numPropagatesHist, statistics::units::Count::get(), "Histogram of number of propagates sent.") { } @@ -514,12 +520,12 @@ PushEngine::PushStats::regStats() { using namespace statistics; - TEPS = numUpdates / simSeconds; + TEPS = numPropagates / simSeconds; edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); updateQueueLength.init(64); - numPropagates.init(push.params().max_propagates_per_cycle); + numPropagatesHist.init(push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c078391420..6163ba5c27 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -134,7 +134,7 @@ class PushEngine : public BaseMemoryEngine int maxPropagatesPerCycle; std::deque> metaEdgeQueue; - std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; @@ -167,16 +167,19 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; - statistics::Scalar numUpdates; + statistics::Scalar numPropagates; statistics::Scalar numNetBlocks; statistics::Scalar numIdleCycles; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; statistics::Formula TEPS; statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; statistics::Histogram updateQueueLength; - statistics::Histogram numPropagates; + statistics::Histogram numPropagatesHist; }; PushStats stats; diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 339fdb6c55..3c5c150b29 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -852,6 +852,16 @@ RangeSize(Addr start, Addr size) return AddrRange(start, start + size); } +inline bool +contains(AddrRangeList range_list, Addr addr) +{ + bool ret = false; + for (auto range: range_list) { + 
ret |= range.contains(addr); + } + return ret; +} + } // namespace gem5 #endif // __BASE_ADDR_RANGE_HH__ From cc19d17fc22d22377f2d3d56c43fe981fb66f70f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 9 Oct 2022 17:15:04 -0700 Subject: [PATCH 189/247] Typos. --- configs/accl/real-graph-gen.py | 74 +++++++++++++++++++ configs/accl/sega-hbm.py | 14 ++-- .../accl/{graph-gen.py => synth-graph-gen.py} | 0 src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 12 +-- src/accl/graph/sega/wl_engine.hh | 2 +- 6 files changed, 89 insertions(+), 15 deletions(-) create mode 100644 configs/accl/real-graph-gen.py rename configs/accl/{graph-gen.py => synth-graph-gen.py} (100%) diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py new file mode 100644 index 0000000000..db44c63a9a --- /dev/null +++ b/configs/accl/real-graph-gen.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import argparse +import subprocess + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("path", type=str, help="Path to the graph file.") + argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + + args = argparser.parse_args() + return args.path, args.num_gpts + +if __name__ == "__main__": + graph_path, num_gpts = get_inputs() + + graph_reader = os.environ.get("GRAPH_READER") + + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + + if not os.path.exists(graph_path): + raise ValueError(f"{graph_path} does not exist.") + + graph_dir = os.path.dirname(graph_path) + if not "binaries" in os.listdir(graph_dir): + print(f"binaries directory not found in {graph_dir}") + os.mkdir(f"{graph_dir}/binaries") + print(f"Created {graph_dir}/binaries") + + if not f"gpts_{num_gpts}" in os.listdir(f"{graph_dir}/binaries"): + print(f"gpts_{num_gpts} not found in {graph_dir}/binaries") + os.mkdir(f"{graph_dir}/binaries/gpts_{num_gpts}") + print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] + if not all([binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): + 
os.remove(delete.path) + print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}", + "false", + f"{num_gpts}", + "32", + f"{graph_dir}/binaries/gpts_{num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_dir}/binaries/gpts_{num_gpts}") diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 70aac6c2cb..cdc752f2bd 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -42,7 +42,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): xorHighBit=0, intlvBits=intlv_bits, intlvMatch=i)) - return ret + return ret, intlv_low_bit + intlv_bits - 1 class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): @@ -112,17 +112,17 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - 2*num_mpus, - 32 - ) + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), + 2*num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): gpt = GPT("2GiB", cache_size) gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) - gpt.set_vertex_pch_bit(8) + gpt.set_vertex_pch_bit(pch_bit) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) # Creating the interconnect among mpus diff --git a/configs/accl/graph-gen.py b/configs/accl/synth-graph-gen.py similarity index 100% rename from configs/accl/graph-gen.py rename to configs/accl/synth-graph-gen.py diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7c89c1edea..82e63d512e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -168,7 +168,7 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - 
std::string print = csprintf("WorklistItem[%lu][%d]: %s.", + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, items[i].to_string()); std::cout << print << std::endl; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 116cdf3f77..eb2006a3df 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -76,16 +76,16 @@ WLEngine::registerMPU(MPU* mpu) owner = mpu; } -AddrRangeList +AddrRangeList WLEngine::getAddrRanges() -{ - return owner->getAddrRanges(); +{ + return owner->getAddrRanges(); } -void +void WLEngine::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); +{ + owner->recvFunctional(pkt); } AddrRangeList diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 5f08678d26..7578044cbf 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -54,7 +54,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner, PortID id): - ResponsePort(name, owner), + ResponsePort(name, owner), owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; From 76407f72953961561a153510f3dc81723f4847e1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 11 Oct 2022 15:07:29 -0700 Subject: [PATCH 190/247] Adding functions to move value to and from float. 
--- src/accl/graph/base/data_structs.hh | 24 +++++++++++++++++++++++- src/accl/graph/sega/push_engine.cc | 13 ++++++------- src/accl/graph/sega/push_engine.hh | 11 ++++------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 34c8eb98ce..3753e10d62 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -33,6 +33,8 @@ #include "base/intmath.hh" #include +#include +#include namespace gem5 { @@ -96,7 +98,7 @@ struct MetaEdge { uint32_t weight; uint32_t value; - MetaEdge(): src(0), dst(0), weight(0), value(0) + MetaEdge(): src(0), dst(0), weight(0), value(0) {} MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): src(src), dst(dst), weight(weight), value(value) @@ -176,6 +178,26 @@ class UniqueFIFO } }; +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d5fb002f82..cd795eaf00 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -220,10 +220,9 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - edgePointerQueue.emplace_back( - start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, - (uint32_t) wl.prop, curTick()); + EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, (uint32_t) wl.prop); + edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; if (workLeft() && vertexSpace() && 
(!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -245,7 +244,8 @@ PushEngine::processNextMemoryReadEvent() Addr aligned_addr, offset; int num_edges; - EdgeReadInfoGen& curr_info = edgePointerQueue.front(); + EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); + Tick entrance_tick = std::get<1>(edgePointerQueue.front()); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { @@ -264,8 +264,7 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); stats.edgePointerQueueLatency.sample( - (curTick() - curr_info.entrance()) * - 1e9 / getClockFrequency()); + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 6163ba5c27..acf012b24d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -73,12 +73,11 @@ class PushEngine : public BaseMemoryEngine Addr _src; uint32_t _value; - Tick _entrance; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value, Tick entrance): - _start(start), _end(end), _step(step), _atom(atom), - _src(src), _value(value), _entrance(entrance) + size_t atom, Addr src, uint32_t value): + _start(start), _end(end), _step(step), + _atom(atom), _src(src), _value(value) {} std::tuple nextReadPacketInfo() @@ -108,8 +107,6 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } - - Tick entrance() { return _entrance; } }; struct PushInfo { Addr src; @@ -126,7 +123,7 @@ class PushEngine : public BaseMemoryEngine int numPendingPulls; int 
edgePointerQueueSize; - std::deque edgePointerQueue; + std::deque> edgePointerQueue; std::unordered_map reqInfoMap; int onTheFlyMemReqs; From 6413163e6f818ddc442e58c9302004c34bff1933 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 11 Oct 2022 15:54:40 -0700 Subject: [PATCH 191/247] Adding sssp and pr. --- src/accl/graph/sega/CoalesceEngine.py | 2 ++ src/accl/graph/sega/PushEngine.py | 3 ++ src/accl/graph/sega/coalesce_engine.cc | 29 ++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 40 ++++++++++++++++++++++---- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 8 +++++- 7 files changed, 63 insertions(+), 21 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index f6e997f1e3..eeba279b7a 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -44,3 +44,5 @@ class CoalesceEngine(BaseMemoryEngine): "requestor in each cycle. 
Used to limit b/w.") workload = Param.String("BFS", "Name of the workload") + + thereshold = Param.Float('0.0001', "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 5e0d2b3212..52dc0e2506 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -51,3 +51,6 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") + + alpha = Param.Float(0.8, "This parameter is specific to pagerank") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f4cd6a950d..91072a1da8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,7 +48,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), workload(params.workload), + _workCount(0), numPullsReceived(0), + workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -110,16 +111,20 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } -uint32_t -CoalesceEngine::reduce(uint32_t update, uint32_t value) +bool +CoalesceEngine::applyCondition(uint32_t update, uint32_t value) { - uint32_t new_value; if(workload == "BFS"){ - new_value = std::min(update, value); + return update != value; + } else if (workload == "SSSP"){ + return update < value; + } else if (workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + return params().thereshold <= abs(float_update - float_value); } else{ - panic("Workload not implemented\n"); + panic("The workload is not recognize"); } - return new_value; } // addr should be aligned to peerMemoryAtomSize @@ -639,7 +644,8 @@ 
CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); - if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { + if (applyCondition( + wl.tempProp, cacheBlocks[block_index].items[wl_offset].tempProp)) { cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].needsApply |= true; // NOTE: We don't set needsWB and rely on processNextApplyEvent to @@ -747,12 +753,7 @@ CoalesceEngine::processNextApplyEvent() assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - // NOTE: It might be the case that for workloads other than BFS, - // the reduce function here should be different to the reduce - // function defined in WLEngine. Think about the case of PR in - // detail. - uint32_t new_prop = reduce( - cacheBlocks[block_index].items[index].tempProp, current_prop); + uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b1f5b1fea1..a087f37b4d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -131,6 +131,7 @@ class CoalesceEngine : public BaseMemoryEngine std::string workload; uint32_t reduce(uint32_t update, uint32_t value); + bool applyCondition(uint32_t update, uint32_t value); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cd795eaf00..c9efa03f08 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -158,6 +158,10 @@ PushEngine::reduce(uint32_t update, uint32_t value) uint32_t new_value; 
if(workload == "BFS"){ new_value = std::min(update, value); + } else if(workload == "PR"){ + new_value = update + value; + } else if(workload == "SSSP"){ + new_value = std::min(update, value); } else{ panic("Workload not implemented\n"); } @@ -165,19 +169,42 @@ PushEngine::reduce(uint32_t update, uint32_t value) } uint32_t -PushEngine::propagate(uint32_t value, uint32_t weight) +PushEngine::propagate(uint32_t delta, uint32_t weight) { std::string workload = params().workload; uint32_t update; if (workload == "BFS") { - update = value + 1; - } - else{ + update = delta + 1; + } else if (workload == "SSSP") { + update = delta + weight; + } else if (workload == "PR") { + float float_form = writeToFloat(delta); + float float_update = float_form * weight * params().alpha; + update = readFromFloat(float_update); + } else{ panic("The workload %s is not supported", workload); } return update; } +uint32_t +PushEngine::calculateValue(WorkListItem wl) +{ + std::string workload = params().workload; + uint32_t delta; + if (workload == "PR") { + float property = writeToFloat(wl.prop) / wl.degree; + delta = readFromFloat(property); + } else if (workload == "BFS") { + delta = wl.prop; + } else if (workload == "SSSP") { + delta = wl.prop; + } else { + panic("Workload not supported."); + } + return delta; +} + void PushEngine::start() { @@ -220,9 +247,11 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = calculateValue(wl); EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, (uint32_t) wl.prop); + peerMemoryAtomSize, addr, value); edgePointerQueue.emplace_back(info_gen, curTick()); + numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -256,7 +285,6 @@ PushEngine::processNextMemoryReadEvent() PacketPtr pkt = 
createReadPacket(aligned_addr, peerMemoryAtomSize); PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; - memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index acf012b24d..c03e78851c 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,6 +133,7 @@ class PushEngine : public BaseMemoryEngine uint32_t reduce(uint32_t update, uint32_t value); uint32_t propagate(uint32_t value, uint32_t weight); + uint32_t calculateValue(WorkListItem wl); int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index eb2006a3df..f684650f23 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -152,8 +152,14 @@ WLEngine::reduce(uint32_t update, uint32_t value) uint32_t new_value; if(workload == "BFS"){ new_value = std::min(update, value); + } else if(workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + new_value = readFromFloat(float_update + float_value); + } else if(workload == "SSSP"){ + new_value = std::min(update, value); } else{ - panic("Workload not implemented\n"); + panic("Workload not implemented."); } return new_value; } From bdb42750389d6e308a726f2d100bb5757895e034 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 11 Oct 2022 21:23:27 -0700 Subject: [PATCH 192/247] making workload appropriate inits --- src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 17 +++++--- src/accl/graph/sega/centeral_controller.hh | 1 + src/accl/graph/sega/coalesce_engine.cc | 51 +++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 2 +- 5 files changed, 50 insertions(+), 22 deletions(-) diff --git a/src/accl/graph/sega/CenteralController.py 
b/src/accl/graph/sega/CenteralController.py index 2ba53c231f..ebc8281641 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,5 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("createInitialPRUpdate"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 82e63d512e..9231f96379 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -71,11 +71,8 @@ CenteralController::initState() [this](PacketPtr pkt) { for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; - for (auto range: range_list) { - if (range.contains(pkt->getAddr())) { - mpu->recvFunctional(pkt); - break; - } + if (contains(range_list, pkt->getAddr())) { + mpu->recvFunctional(pkt); } } }, system->cacheLineSize()); @@ -139,6 +136,16 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) initialUpdates.push_back(update); } +void +CenteralController::createInitialPRUpdate() +{ + for (auto mpu: mpuVector) { + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } + } +} + void CenteralController::recvDoneSignal() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index d006851e3b..5b0f5d6816 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -64,6 +64,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); + void createInitialPRUpdate(); void recvDoneSignal(); void printAnswerToHostSimout(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 91072a1da8..92ad346b30 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ 
b/src/accl/graph/sega/coalesce_engine.cc @@ -75,6 +75,40 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } +void +CoalesceEngine::algoInit(PacketPtr pkt) +{ + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + if(workload == "PR") { + //TODO: Add Alpha + int bit_index_base = getBitIndexBase(pkt->getAddr()); + for (int i = 0; i < numElementsPerLine; i++) { + items[i].tempProp = readFromFloat(1 - 0.2); + items[i].prop = readFromFloat(1 - 0.2); + needsPush[bit_index_base + i] = 1; + activeBits.push_back(bit_index_base + i); + } + } + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); +} + +bool +CoalesceEngine::applyCondition(uint32_t update, uint32_t value) +{ + if(workload == "BFS"){ + return update != value; + } else if (workload == "SSSP"){ + return update < value; + } else if (workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + return params().thereshold <= abs(float_update - float_value); + } else{ + panic("The workload is not recognize"); + } +} + void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -100,6 +134,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { + algoInit(pkt); memPort.sendFunctional(pkt); } } @@ -111,22 +146,6 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } -bool -CoalesceEngine::applyCondition(uint32_t update, uint32_t value) -{ - if(workload == "BFS"){ - return update != value; - } else if (workload == "SSSP"){ - return update < value; - } else if (workload == "PR"){ - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - return params().thereshold <= abs(float_update - float_value); - } else{ - panic("The workload is not recognize"); - } -} - // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) diff --git a/src/accl/graph/sega/coalesce_engine.hh 
b/src/accl/graph/sega/coalesce_engine.hh index a087f37b4d..49ee441ed3 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -130,7 +130,7 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map pendingVertexPullReads; std::string workload; - uint32_t reduce(uint32_t update, uint32_t value); + void algoInit(PacketPtr pkt); bool applyCondition(uint32_t update, uint32_t value); MemoryEvent nextMemoryEvent; From 5fa0c4c2376706e694afa3babbe2353baafd7440 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 12 Oct 2022 14:41:19 -0700 Subject: [PATCH 193/247] wip for implementing prewB and prePush apply functions. --- src/accl/graph/sega/CoalesceEngine.py | 7 ++- src/accl/graph/sega/WLEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 61 +++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 4 ++ src/accl/graph/sega/mpu.hh | 2 + src/accl/graph/sega/push_engine.hh | 2 + 6 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index eeba279b7a..a50a814e89 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -43,6 +43,11 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. 
Used to limit b/w.") + post_apply_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + workload = Param.String("BFS", "Name of the workload") - thereshold = Param.Float('0.0001', "Score threshold for Pagerank") + threshold = Param.Float(0.0001, "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 91325ab53f..7fe392cc9e 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -45,4 +45,4 @@ class WLEngine(BaseReduceEngine): "many updates as this queueu has " "entries at the same time.") - workload = Param.String('BFS',"Name of the workload") \ No newline at end of file + workload = Param.String("BFS","Name of the workload") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 92ad346b30..4e1fe79899 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,7 +48,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), + _workCount(0), numPullsReceived(0), + postApplyWBQueueSize(params.post_apply_wb_queue_size), workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -67,6 +68,16 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } needsPush.reset(); + + // TODO: Get rid of these booleans. 
+ // applyBeforeWB = true; + // if (workload == "PR") { + // applyBeforeWB = false; + // } + // applyBeforePush = false; + // if (workload == "PR") { + // applyBeforePush = true; + // } } void @@ -84,7 +95,7 @@ CoalesceEngine::algoInit(PacketPtr pkt) //TODO: Add Alpha int bit_index_base = getBitIndexBase(pkt->getAddr()); for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(1 - 0.2); + items[i].tempProp = readFromFloat(0); items[i].prop = readFromFloat(1 - 0.2); needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); @@ -96,15 +107,15 @@ CoalesceEngine::algoInit(PacketPtr pkt) bool CoalesceEngine::applyCondition(uint32_t update, uint32_t value) { - if(workload == "BFS"){ + if(workload == "BFS") { return update != value; - } else if (workload == "SSSP"){ + } else if (workload == "SSSP") { return update < value; - } else if (workload == "PR"){ + } else if (workload == "PR") { float float_value = writeToFloat(value); float float_update = writeToFloat(update); - return params().thereshold <= abs(float_update - float_value); - } else{ + return params().threshold <= abs(float_update - float_value); + } else { panic("The workload is not recognize"); } } @@ -663,14 +674,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); - if (applyCondition( - wl.tempProp, cacheBlocks[block_index].items[wl_offset].tempProp)) { - cacheBlocks[block_index].items[wl_offset] = wl; - cacheBlocks[block_index].needsApply |= true; - // NOTE: We don't set needsWB and rely on processNextApplyEvent to - // set that bit. 
+ if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].needsWB |= true; stats.numVertexWrites++; } + if (applyCondition(wl.tempProp, + cacheBlocks[block_index].items[wl_offset].prop)) { + cacheBlocks[block_index].needsApply |= true; + } + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -773,10 +785,13 @@ CoalesceEngine::processNextApplyEvent() for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; - if (new_prop != current_prop) { - cacheBlocks[block_index].items[index].tempProp = new_prop; - cacheBlocks[block_index].items[index].prop = new_prop; - + if (applyCondition(new_prop, current_prop)) { + if (applyBeforeWB) { + cacheBlocks[block_index].items[index].tempProp = new_prop; + cacheBlocks[block_index].items[index].prop = new_prop; + } + // TODO: Implement this function + // bool do_push = preWBApply(cacheBlocks[block_index].items[index]); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); @@ -1046,6 +1061,18 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) needsPush[slice_base_index + wl_offset] = 0; _workCount--; + + // TODO: Implement a function like this. + // uint32_t delta, bool do_wb = prePushApply(cacheBlocks[block_index].items[wl_offset]); + // TODO: After implementing the above function get rid of this bool + // if (applyBeforePush) { + // cacheBlocks[block_index].items[wl_offset].prop = + // cacheBlocks[block_index].items[wl_offset].tempProp; + // } + // TODO: Implement recvVertexPush2 in PushEngine. 
+ // owner->recvVertexPush2(vertex_addr, delta, + // cacheBlocks[block_index].items[wl_offset].edgeIndex, + // cacheBlocks[block_index].items[wl_offset].degree); owner->recvVertexPush( vertex_addr, cacheBlocks[block_index].items[wl_offset]); stats.verticesPushed++; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 49ee441ed3..c9564ac187 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -114,11 +114,15 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; + bool applyBeforeWB; + bool applyBeforePush; int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; + int postApplyWBQueueSize; + std::deque postApplyWBQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 229bd28950..9dcb9de5d7 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,6 +75,8 @@ class MPU : public SimObject bool running() { return pushEngine->running(); } void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); + void recvVertexPush2(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c03e78851c..ec0dd09e43 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -199,6 +199,8 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); + void recvVertexPush2(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); void recvReqRetry(); From 2e1719a6537238b64337472dd0b5b741b07bc0c3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 14 Oct 2022 16:24:09 -0700 Subject: [PATCH 194/247] 
Adding GraphWorkload class. --- configs/accl/sega-hbm.py | 7 +- src/accl/graph/base/SConscript | 1 + src/accl/graph/base/data_structs.hh | 3 +- src/accl/graph/base/graph_workload.cc | 66 ++++++++++++ src/accl/graph/base/graph_workload.hh | 74 +++++++++++++ src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 10 ++ src/accl/graph/sega/centeral_controller.hh | 4 + src/accl/graph/sega/coalesce_engine.cc | 76 +++++++------- src/accl/graph/sega/coalesce_engine.hh | 9 +- src/accl/graph/sega/mpu.cc | 8 ++ src/accl/graph/sega/mpu.hh | 1 + src/accl/graph/sega/push_engine.cc | 115 +++++++++++---------- src/accl/graph/sega/push_engine.hh | 5 +- src/accl/graph/sega/wl_engine.cc | 39 ++++--- src/accl/graph/sega/wl_engine.hh | 5 +- 16 files changed, 302 insertions(+), 122 deletions(-) create mode 100644 src/accl/graph/base/graph_workload.cc create mode 100644 src/accl/graph/base/graph_workload.hh diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index cdc752f2bd..50fd5f3069 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -56,7 +56,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): cache_size=cache_size, num_mshr_entry=64, num_tgts_per_mshr=64, - max_resp_per_cycle=8 + max_resp_per_cycle=8, + post_apply_wb_queue_size=64 ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -135,6 +136,9 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): self.ctrl.createInitialBFSUpdate(init_addr, init_value) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) def print_answer(self): self.ctrl.printAnswerToHostSimout() @@ -166,6 +170,7 @@ def get_inputs(): m5.instantiate() system.create_initial_bfs_update(init_addr, init_value) + system.create_bfs_workload(init_addr, init_value) exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because 
{exit_event.getCause()}") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 8b741abfc8..35111c34d2 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -30,3 +30,4 @@ Import("*") SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 3753e10d62..2d81375b63 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,9 +32,10 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include +#include #include #include +#include namespace gem5 { diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc new file mode 100644 index 0000000000..3d0d45b1de --- /dev/null +++ b/src/accl/graph/base/graph_workload.cc @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/graph_workload.hh" + +namespace gem5 +{ + +uint32_t +BFSWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +BFSWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + 1; +} + +bool +BFSWorkload::applyCondition(WorkListItem wl) +{ + return wl.tempProp < wl.prop; +} + +bool +BFSWorkload::preWBApply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.degree > 0; +} + +std::tuple +BFSWorkload::prePushApply(WorkListItem& wl) +{ + uint32_t value = wl.prop; + return std::make_tuple(value, false); +} + +} // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh new file mode 100644 index 0000000000..304b434a3d --- /dev/null +++ b/src/accl/graph/base/graph_workload.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" + + +namespace gem5 +{ + +class GraphWorkload +{ + public: + GraphWorkload() {} + ~GraphWorkload() {} + virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; + virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; + virtual bool applyCondition(WorkListItem wl) = 0; + virtual bool preWBApply(WorkListItem& wl) = 0; + virtual std::tuple prePushApply(WorkListItem& wl) = 0; +}; + +class BFSWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + public: + BFSWorkload(uint64_t init_addr, uint32_t init_value): + GraphWorkload(), + initAddr(init_addr), initValue(init_value) + {} + + ~BFSWorkload() {} + + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual bool applyCondition(WorkListItem wl); + virtual bool preWBApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index ebc8281641..17badf9ec4 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,7 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("createBFSWorkload"), PyBindMethod("createInitialPRUpdate"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 9231f96379..2074f69f08 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -83,6 +83,10 @@ CenteralController::initState() void CenteralController::startup() { + for (auto mpu: mpuVector) { + mpu->recvWorkload(workload); + } + 
while(!initialUpdates.empty()) { PacketPtr front = initialUpdates.front(); for (auto mpu: mpuVector) { @@ -136,6 +140,12 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) initialUpdates.push_back(update); } +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + void CenteralController::createInitialPRUpdate() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 5b0f5d6816..1f1df00b4b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -32,6 +32,7 @@ #include #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "debug/FinalAnswer.hh" @@ -47,6 +48,8 @@ class CenteralController : public ClockedObject private: System* system; + GraphWorkload* workload; + Addr maxVertexAddr; std::deque initialUpdates; @@ -64,6 +67,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); + void createBFSWorkload(Addr init_addr, uint32_t init_value); void createInitialPRUpdate(); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4e1fe79899..20bfaf8481 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -68,16 +68,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } needsPush.reset(); - - // TODO: Get rid of these booleans. 
- // applyBeforeWB = true; - // if (workload == "PR") { - // applyBeforeWB = false; - // } - // applyBeforePush = false; - // if (workload == "PR") { - // applyBeforePush = true; - // } } void @@ -90,9 +80,10 @@ void CoalesceEngine::algoInit(PacketPtr pkt) { WorkListItem items[numElementsPerLine]; - pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + if(workload == "PR") { //TODO: Add Alpha + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); int bit_index_base = getBitIndexBase(pkt->getAddr()); for (int i = 0; i < numElementsPerLine; i++) { items[i].tempProp = readFromFloat(0); @@ -100,25 +91,39 @@ CoalesceEngine::algoInit(PacketPtr pkt) needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); } + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); } - pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + } -bool -CoalesceEngine::applyCondition(uint32_t update, uint32_t value) -{ - if(workload == "BFS") { - return update != value; - } else if (workload == "SSSP") { - return update < value; - } else if (workload == "PR") { - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - return params().threshold <= abs(float_update - float_value); - } else { - panic("The workload is not recognize"); - } -} +// bool +// CoalesceEngine::applyCondition(WorkListItem wl) +// { +// if (workload == "BFS") { +// return wl.tempProp != wl.prop; +// } else if (workload == "SSSP") { +// return wl.tempProp < wl.prop; +// } else if (workload == "PR") { +// float float_temp = writeToFloat(wl.tempProp); +// float float_prop = writeToFloat(wl.prop); +// return params().threshold <= abs(float_prop - float_temp); +// } else { +// panic("The workload is not recognized."); +// } +// } + +// bool +// CoalesceEngine::preWBApply(WorkListItem& wl) +// { +// if (workload == "BFS") { +// uint32_t new_prop = std::min(wl.tempProp, wl.prop); +// wl.tempProp = new_prop; +// wl.prop = new_prop; +// return 
wl.degree > 0; +// } else { +// panic("The workload is not recognized."); +// } +// } void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -678,11 +683,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].needsWB |= true; stats.numVertexWrites++; } - if (applyCondition(wl.tempProp, - cacheBlocks[block_index].items[wl_offset].prop)) { + cacheBlocks[block_index].items[wl_offset] = wl; + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { cacheBlocks[block_index].needsApply |= true; } - cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -783,19 +787,13 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { - uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; - if (applyCondition(new_prop, current_prop)) { - if (applyBeforeWB) { - cacheBlocks[block_index].items[index].tempProp = new_prop; - cacheBlocks[block_index].items[index].prop = new_prop; - } + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[index])) { // TODO: Implement this function - // bool do_push = preWBApply(cacheBlocks[block_index].items[index]); + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (cacheBlocks[block_index].items[index].degree > 0) { + if (do_push) { if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c9564ac187..3492cab9dc 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -31,8 +31,9 
@@ #include -#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -134,8 +135,11 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map pendingVertexPullReads; std::string workload; + GraphWorkload* graphWorkload; + void algoInit(PacketPtr pkt); - bool applyCondition(uint32_t update, uint32_t value); + bool applyCondition(WorkListItem wl); + bool preWBApply(WorkListItem& wl); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -203,6 +207,7 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt); bool recvWLRead(Addr addr); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 44054d1efb..70f1e05f32 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -71,6 +71,14 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 9dcb9de5d7..8f6101c325 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -69,6 +69,7 @@ class MPU : public SimObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return 
coalesceEngine->recvVertexPull(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c9efa03f08..a661a755b7 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -151,59 +151,59 @@ PushEngine::done() (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } -uint32_t -PushEngine::reduce(uint32_t update, uint32_t value) -{ - std::string workload = params().workload; - uint32_t new_value; - if(workload == "BFS"){ - new_value = std::min(update, value); - } else if(workload == "PR"){ - new_value = update + value; - } else if(workload == "SSSP"){ - new_value = std::min(update, value); - } else{ - panic("Workload not implemented\n"); - } - return new_value; -} - -uint32_t -PushEngine::propagate(uint32_t delta, uint32_t weight) -{ - std::string workload = params().workload; - uint32_t update; - if (workload == "BFS") { - update = delta + 1; - } else if (workload == "SSSP") { - update = delta + weight; - } else if (workload == "PR") { - float float_form = writeToFloat(delta); - float float_update = float_form * weight * params().alpha; - update = readFromFloat(float_update); - } else{ - panic("The workload %s is not supported", workload); - } - return update; -} - -uint32_t -PushEngine::calculateValue(WorkListItem wl) -{ - std::string workload = params().workload; - uint32_t delta; - if (workload == "PR") { - float property = writeToFloat(wl.prop) / wl.degree; - delta = readFromFloat(property); - } else if (workload == "BFS") { - delta = wl.prop; - } else if (workload == "SSSP") { - delta = wl.prop; - } else { - panic("Workload not supported."); - } - return delta; -} +// uint32_t +// PushEngine::reduce(uint32_t update, uint32_t value) +// { +// std::string workload = params().workload; +// uint32_t new_value; +// if(workload == "BFS"){ +// new_value = std::min(update, value); +// } else if(workload == "PR"){ +// new_value = update + value; +// } else if(workload == "SSSP"){ +// new_value = 
std::min(update, value); +// } else{ +// panic("Workload not implemented\n"); +// } +// return new_value; +// } + +// uint32_t +// PushEngine::propagate(uint32_t delta, uint32_t weight) +// { +// std::string workload = params().workload; +// uint32_t update; +// if (workload == "BFS") { +// update = delta + 1; +// } else if (workload == "SSSP") { +// update = delta + weight; +// } else if (workload == "PR") { +// float float_form = writeToFloat(delta); +// float float_update = float_form * weight * params().alpha; +// update = readFromFloat(float_update); +// } else{ +// panic("The workload %s is not supported", workload); +// } +// return update; +// } + +// uint32_t +// PushEngine::calculateValue(WorkListItem wl) +// { +// std::string workload = params().workload; +// uint32_t delta; +// if (workload == "PR") { +// float property = writeToFloat(wl.prop) / wl.degree; +// delta = readFromFloat(property); +// } else if (workload == "BFS") { +// delta = wl.prop; +// } else if (workload == "SSSP") { +// delta = wl.prop; +// } else { +// panic("Workload not supported."); +// } +// return delta; +// } void PushEngine::start() @@ -247,9 +247,9 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = calculateValue(wl); + // uint32_t value = calculateValue(wl); EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, value); + peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; @@ -364,7 +364,8 @@ PushEngine::processNextPropagateEvent() DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); - uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); + uint32_t update_value = + graphWorkload->propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); 
metaEdgeQueue.pop_front(); @@ -419,7 +420,7 @@ PushEngine::enqueueUpdate(Update update) Update& curr_update = std::get<0>(entry); if (curr_update.dst == update.dst) { uint32_t old_value = curr_update.value; - curr_update.value = reduce(old_value, update.value); + curr_update.value = graphWorkload->reduce(old_value, update.value); DPRINTF(PushEngine, "%s: found a coalescing opportunity " "for destination %d with new value: %d by " "coalescing %d and %d. \n", __func__, update.dst, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ec0dd09e43..47db96d818 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,8 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -115,6 +116,7 @@ class PushEngine : public BaseMemoryEngine int numElements; }; MPU* owner; + GraphWorkload* graphWorkload; bool _running; Tick lastIdleEntranceTick; @@ -194,6 +196,7 @@ class PushEngine : public BaseMemoryEngine virtual void init() override; void registerMPU(MPU* mpu); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void start(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f684650f23..86acd40b69 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -41,7 +41,6 @@ WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), - workload(params.workload), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ 
processNextReduceEvent(); }, name()), stats(*this) @@ -146,23 +145,23 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } -uint32_t -WLEngine::reduce(uint32_t update, uint32_t value) -{ - uint32_t new_value; - if(workload == "BFS"){ - new_value = std::min(update, value); - } else if(workload == "PR"){ - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - new_value = readFromFloat(float_update + float_value); - } else if(workload == "SSSP"){ - new_value = std::min(update, value); - } else{ - panic("Workload not implemented."); - } - return new_value; -} +// uint32_t +// WLEngine::reduce(uint32_t update, uint32_t value) +// { +// uint32_t new_value; +// if(workload == "BFS"){ +// new_value = std::min(update, value); +// } else if(workload == "PR"){ +// float float_value = writeToFloat(value); +// float float_update = writeToFloat(update); +// new_value = readFromFloat(float_update + float_value); +// } else if(workload == "SSSP"){ +// new_value = std::min(update, value); +// } else{ +// panic("Workload not implemented."); +// } +// return new_value; +// } bool WLEngine::handleIncomingUpdate(PacketPtr pkt) @@ -251,7 +250,7 @@ WLEngine::processNextReadEvent() "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = - reduce(update_value, registerFile[update_addr]); + graphWorkload->reduce(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. 
registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -310,7 +309,7 @@ WLEngine::processNextReduceEvent() addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = - reduce(update_value, workListFile[addr].tempProp); + graphWorkload->reduce(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 7578044cbf..0d0e532269 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/graph_workload.hh" #include "accl/graph/base/data_structs.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" @@ -70,7 +71,8 @@ class WLEngine : public BaseReduceEngine }; MPU* owner; - + GraphWorkload* graphWorkload; + std::vector inPorts; int updateQueueSize; @@ -118,6 +120,7 @@ class WLEngine : public BaseReduceEngine void registerMPU(MPU* mpu); AddrRangeList getAddrRanges(); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } void recvFunctional(PacketPtr pkt); bool handleIncomingUpdate(PacketPtr pkt); From fba3e575719072c9dec328df5c6f0603bb9d7c6f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 15 Oct 2022 16:59:05 -0700 Subject: [PATCH 195/247] Cleaning up. 
--- src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/PushEngine.py | 7 +-- src/accl/graph/sega/WLEngine.py | 2 - src/accl/graph/sega/centeral_controller.cc | 5 +- src/accl/graph/sega/coalesce_engine.cc | 64 ++++++---------------- src/accl/graph/sega/coalesce_engine.hh | 8 +-- src/accl/graph/sega/push_engine.cc | 58 +------------------- src/accl/graph/sega/push_engine.hh | 4 -- src/accl/graph/sega/wl_engine.cc | 18 ------ src/accl/graph/sega/wl_engine.hh | 6 +- 10 files changed, 23 insertions(+), 152 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index a50a814e89..d462d618e6 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -48,6 +48,3 @@ class CoalesceEngine(BaseMemoryEngine): "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") - workload = Param.String("BFS", "Name of the workload") - - threshold = Param.Float(0.0001, "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 52dc0e2506..20c5452d43 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,8 +34,6 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - workload = Param.String("BFS", "Name of the workload.") - push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be @@ -43,7 +41,7 @@ class PushEngine(BaseMemoryEngine): resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " "edges read from memory.") - + max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " "done per cycle.") @@ -51,6 +49,3 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") - - alpha = Param.Float(0.8, "This parameter is 
specific to pagerank") - diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 7fe392cc9e..5a8ed9c9fd 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -44,5 +44,3 @@ class WLEngine(BaseReduceEngine): "WLEngine has. It can service as " "many updates as this queueu has " "entries at the same time.") - - workload = Param.String("BFS","Name of the workload") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 2074f69f08..fd282834e9 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -55,6 +55,7 @@ CenteralController::initState() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); + mpu->recvWorkload(workload); } const auto& file = params().image_file; if (file == "") @@ -83,10 +84,6 @@ CenteralController::initState() void CenteralController::startup() { - for (auto mpu: mpuVector) { - mpu->recvWorkload(workload); - } - while(!initialUpdates.empty()) { PacketPtr front = initialUpdates.front(); for (auto mpu: mpuVector) { diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 20bfaf8481..fa5099353e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -50,7 +50,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), postApplyWBQueueSize(params.post_apply_wb_queue_size), - workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -76,52 +75,22 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } -void -CoalesceEngine::algoInit(PacketPtr pkt) -{ - WorkListItem items[numElementsPerLine]; - - if(workload == "PR") { - //TODO: Add Alpha - pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - int bit_index_base = getBitIndexBase(pkt->getAddr()); 
- for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - 0.2); - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); - } - pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); - } - -} - -// bool -// CoalesceEngine::applyCondition(WorkListItem wl) -// { -// if (workload == "BFS") { -// return wl.tempProp != wl.prop; -// } else if (workload == "SSSP") { -// return wl.tempProp < wl.prop; -// } else if (workload == "PR") { -// float float_temp = writeToFloat(wl.tempProp); -// float float_prop = writeToFloat(wl.prop); -// return params().threshold <= abs(float_prop - float_temp); -// } else { -// panic("The workload is not recognized."); -// } -// } - -// bool -// CoalesceEngine::preWBApply(WorkListItem& wl) +// void +// CoalesceEngine::algoInit(PacketPtr pkt) // { -// if (workload == "BFS") { -// uint32_t new_prop = std::min(wl.tempProp, wl.prop); -// wl.tempProp = new_prop; -// wl.prop = new_prop; -// return wl.degree > 0; -// } else { -// panic("The workload is not recognized."); +// WorkListItem items[numElementsPerLine]; + +// if(workload == "PR") { +// //TODO: Add Alpha +// pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); +// int bit_index_base = getBitIndexBase(pkt->getAddr()); +// for (int i = 0; i < numElementsPerLine; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - 0.2); +// needsPush[bit_index_base + i] = 1; +// activeBits.push_back(bit_index_base + i); +// } +// pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); // } // } @@ -150,7 +119,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - algoInit(pkt); + // TODO: Add and implement init function for GraphWorkload. 
+ // graphWorkload->init(pkt); memPort.sendFunctional(pkt); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3492cab9dc..0a2c0ca5ff 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -103,6 +103,7 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; MPU* owner; + GraphWorkload* graphWorkload; int numLines; int numElementsPerLine; @@ -134,13 +135,6 @@ class CoalesceEngine : public BaseMemoryEngine // send for push when getting the read response from memory. std::unordered_map pendingVertexPullReads; - std::string workload; - GraphWorkload* graphWorkload; - - void algoInit(PacketPtr pkt); - bool applyCondition(WorkListItem wl); - bool preWBApply(WorkListItem& wl); - MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a661a755b7..c54f19307f 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -151,60 +151,6 @@ PushEngine::done() (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } -// uint32_t -// PushEngine::reduce(uint32_t update, uint32_t value) -// { -// std::string workload = params().workload; -// uint32_t new_value; -// if(workload == "BFS"){ -// new_value = std::min(update, value); -// } else if(workload == "PR"){ -// new_value = update + value; -// } else if(workload == "SSSP"){ -// new_value = std::min(update, value); -// } else{ -// panic("Workload not implemented\n"); -// } -// return new_value; -// } - -// uint32_t -// PushEngine::propagate(uint32_t delta, uint32_t weight) -// { -// std::string workload = params().workload; -// uint32_t update; -// if (workload == "BFS") { -// update = delta + 1; -// } else if (workload == "SSSP") { -// update = delta + weight; -// } else if (workload == "PR") { -// float 
float_form = writeToFloat(delta); -// float float_update = float_form * weight * params().alpha; -// update = readFromFloat(float_update); -// } else{ -// panic("The workload %s is not supported", workload); -// } -// return update; -// } - -// uint32_t -// PushEngine::calculateValue(WorkListItem wl) -// { -// std::string workload = params().workload; -// uint32_t delta; -// if (workload == "PR") { -// float property = writeToFloat(wl.prop) / wl.degree; -// delta = readFromFloat(property); -// } else if (workload == "BFS") { -// delta = wl.prop; -// } else if (workload == "SSSP") { -// delta = wl.prop; -// } else { -// panic("Workload not supported."); -// } -// return delta; -// } - void PushEngine::start() { @@ -251,7 +197,7 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); - + numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -364,7 +310,7 @@ PushEngine::processNextPropagateEvent() DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); - uint32_t update_value = + uint32_t update_value = graphWorkload->propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); metaEdgeQueue.pop_front(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 47db96d818..1112176897 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,10 +133,6 @@ class PushEngine : public BaseMemoryEngine int maxPropagatesPerCycle; std::deque> metaEdgeQueue; - uint32_t reduce(uint32_t update, uint32_t value); - uint32_t propagate(uint32_t value, uint32_t weight); - uint32_t calculateValue(WorkListItem wl); - int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); bool 
enqueueUpdate(Update update); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 86acd40b69..85fe9be2ca 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -145,24 +145,6 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } -// uint32_t -// WLEngine::reduce(uint32_t update, uint32_t value) -// { -// uint32_t new_value; -// if(workload == "BFS"){ -// new_value = std::min(update, value); -// } else if(workload == "PR"){ -// float float_value = writeToFloat(value); -// float float_update = writeToFloat(update); -// new_value = readFromFloat(float_update + float_value); -// } else if(workload == "SSSP"){ -// new_value = std::min(update, value); -// } else{ -// panic("Workload not implemented."); -// } -// return new_value; -// } - bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 0d0e532269..f442d6060e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -72,7 +72,7 @@ class WLEngine : public BaseReduceEngine MPU* owner; GraphWorkload* graphWorkload; - + std::vector inPorts; int updateQueueSize; @@ -81,12 +81,8 @@ class WLEngine : public BaseReduceEngine int registerFileSize; std::unordered_map registerFile; std::unordered_map vertexReadTime; - std::unordered_map workListFile; - std::string workload; - uint32_t reduce(uint32_t update, uint32_t value); - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); From 01ab8f8809451179d27f3f5da7be57675161f4e7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 16 Oct 2022 17:05:07 -0700 Subject: [PATCH 196/247] Implementing post push wb buffer. 
--- src/accl/graph/base/graph_workload.cc | 19 +- src/accl/graph/base/graph_workload.hh | 6 +- src/accl/graph/sega/CoalesceEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 239 +++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 10 +- src/accl/graph/sega/mpu.cc | 12 +- src/accl/graph/sega/mpu.hh | 4 +- src/accl/graph/sega/push_engine.cc | 17 +- src/accl/graph/sega/push_engine.hh | 23 ++- 9 files changed, 223 insertions(+), 109 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 3d0d45b1de..6a8e000515 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -28,10 +28,10 @@ #include "accl/graph/base/graph_workload.hh" -namespace gem5 +namespace gem5 { -uint32_t +uint32_t BFSWorkload::reduce(uint32_t update, uint32_t value) { return std::min(update, value); @@ -43,7 +43,7 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) return value + 1; } -bool +bool BFSWorkload::applyCondition(WorkListItem wl) { return wl.tempProp < wl.prop; @@ -52,15 +52,20 @@ BFSWorkload::applyCondition(WorkListItem wl) bool BFSWorkload::preWBApply(WorkListItem& wl) { - wl.prop = wl.tempProp; - return wl.degree > 0; + if (applyCondition(wl)) { + wl.prop = wl.tempProp; + if (wl.degree > 0) { + return true; + } + } + return false; } -std::tuple +std::tuple BFSWorkload::prePushApply(WorkListItem& wl) { uint32_t value = wl.prop; - return std::make_tuple(value, false); + return std::make_tuple(value, true, false); } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 304b434a3d..c4db5c9e2f 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -46,7 +46,7 @@ class GraphWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; virtual bool preWBApply(WorkListItem& wl) = 0; - virtual std::tuple 
prePushApply(WorkListItem& wl) = 0; + virtual std::tuple prePushApply(WorkListItem& wl) = 0; }; class BFSWorkload : public GraphWorkload @@ -56,7 +56,7 @@ class BFSWorkload : public GraphWorkload uint32_t initValue; public: BFSWorkload(uint64_t init_addr, uint32_t init_value): - GraphWorkload(), + GraphWorkload(), initAddr(init_addr), initValue(init_value) {} @@ -66,7 +66,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); }; } diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index d462d618e6..1fd3b968c5 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -43,7 +43,7 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. 
Used to limit b/w.") - post_apply_wb_queue_size = Param.Int("Maximum number of pending wb after " + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fa5099353e..0c223a8a5b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,16 +49,17 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), - postApplyWBQueueSize(params.post_apply_wb_queue_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + maxPotentialPostPushWB(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), nextResponseEvent([this] { processNextResponseEvent(); }, name() + ".nextResponseEvent"), - nextApplyEvent([this] { - processNextApplyEvent(); - }, name() + ".nextApplyEvent"), + nextPreWBApplyEvent([this] { + processNextPreWBApplyEvent(); + }, name() + ".nextPreWBApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -469,7 +470,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) onTheFlyReqs--; Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); + WorkListItem* items = pkt->getPtr(); + bool do_wb = false; if (pkt->findNextSenderState()) { assert(!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid))); @@ -480,7 +483,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "for addr %lu.\n", __func__, addr); int it = getBitIndexBase(addr); uint64_t send_mask = pendingVertexPullReads[addr]; - WorkListItem* items = pkt->getPtr(); // No applying of the line needed. 
for (int i = 0; i < numElementsPerLine; i++) { Addr vertex_addr = addr + i * sizeof(WorkListItem); @@ -489,19 +491,30 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(needsPush[it + i] == 1); needsPush[it + i] = 0; _workCount--; - owner->recvVertexPush(vertex_addr, items[i]); + + uint32_t delta; + bool do_push, do_wb_v; + std::tie(delta, do_push, do_wb_v) = + graphWorkload->prePushApply(items[i]); + do_wb |= do_wb_v; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + items[i].edgeIndex, items[i].degree); + } else { + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } pendingVertexPullReads.erase(addr); - delete pkt; - return true; + maxPotentialPostPushWB--; } if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); + "fill cacheBlocks[%d].\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); assert(!cacheBlocks[block_index].valid); @@ -512,19 +525,30 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); assert(MSHR.find(block_index) != MSHR.end()); - pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, - peerMemoryAtomSize); + std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, - cacheBlocks[block_index].items[i].to_string()); + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, + cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsWB |= do_wb; cacheBlocks[block_index].pendingData = false; 
cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - delete pkt; + } else if (do_wb) { + PacketPtr wb_pkt = createWritePacket( + addr, peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: Fuck 2.\n", __func__); } for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { @@ -570,6 +594,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } + delete pkt; return true; } @@ -675,8 +700,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); } } else { assert(MSHR.size() <= numMSHREntries); @@ -742,7 +767,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } void -CoalesceEngine::processNextApplyEvent() +CoalesceEngine::processNextPreWBApplyEvent() { int block_index = applyQueue.front(); DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. 
" @@ -757,27 +782,22 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[index])) { - // TODO: Implement this function - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - int bit_index_base = - getBitIndexBase(cacheBlocks[block_index].addr); - - if (do_push) { - if (needsPush[bit_index_base + index] == 0) { - _workCount++; - needsPush[bit_index_base + index] = 1; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + if (do_push) { + int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); + if (needsPush[bit_index_base + index] == 0) { + _workCount++; + needsPush[bit_index_base + index] = 1; + activeBits.push_back(bit_index_base + index); + if (!owner->running()) { + owner->start(); } } } } stats.bitvectorLength.sample(needsPush.count()); - cacheBlocks[block_index].needsWB = true; + assert(cacheBlocks[block_index].needsWB); cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].lastChangedTick = curTick(); @@ -810,8 +830,8 @@ CoalesceEngine::processNextApplyEvent() applyQueue.pop_front(); if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); } if (done()) { @@ -870,16 +890,78 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) assert(cacheBlocks[block_index].pendingData); assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - 
DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != + + bool need_send_pkt = true; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].needsWB = true; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == cacheBlocks[block_index].addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. 
+ cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + postPushWBQueue.erase(wb); + need_send_pkt = false; + } + } + + if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != pendingVertexPullReads.end()) { - stats.numDoubleMemReads++; + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } } } @@ -948,6 +1030,18 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + memPort.sendPacket(wb_pkt); + postPushWBQueue.pop_front(); + } +} + std::tuple CoalesceEngine::getOptimalPullAddr() { @@ -1017,6 +1111,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) assert(vertex_send_mask == 0); send_mask |= (1 << index_offset); pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; } if (bit_status == BitStatus::IN_CACHE) { // renaming the outputs to their local names. 
@@ -1030,35 +1125,39 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) needsPush[slice_base_index + wl_offset] = 0; _workCount--; - // TODO: Implement a function like this. - // uint32_t delta, bool do_wb = prePushApply(cacheBlocks[block_index].items[wl_offset]); - // TODO: After implementing the above function get rid of this bool - // if (applyBeforePush) { - // cacheBlocks[block_index].items[wl_offset].prop = - // cacheBlocks[block_index].items[wl_offset].tempProp; - // } - // TODO: Implement recvVertexPush2 in PushEngine. - // owner->recvVertexPush2(vertex_addr, delta, - // cacheBlocks[block_index].items[wl_offset].edgeIndex, - // cacheBlocks[block_index].items[wl_offset].degree); - owner->recvVertexPush( - vertex_addr, cacheBlocks[block_index].items[wl_offset]); + uint32_t delta; + bool do_push, do_wb; + std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].needsWB |= do_wb; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + cacheBlocks[block_index].items[wl_offset].edgeIndex, + cacheBlocks[block_index].items[wl_offset].degree); + } else { + DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); + owner->recvPrevPullCorrection(); + } stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; + numPullsReceived--; } if (bit_status == BitStatus::IN_MEMORY) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - pendingVertexPullReads[addr] = send_mask; + if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << 
index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + onTheFlyReqs++; + maxPotentialPostPushWB++; + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } } - numPullsReceived--; } stats.bitvectorSearchStatus[bit_status]++; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0a2c0ca5ff..c0091a494d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -123,14 +123,15 @@ class CoalesceEngine : public BaseMemoryEngine UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; - int postApplyWBQueueSize; - std::deque postApplyWBQueue; + int postPushWBQueueSize; + std::deque> postPushWBQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalPullAddr(); + int maxPotentialPostPushWB; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. 
std::unordered_map pendingVertexPullReads; @@ -140,14 +141,15 @@ class CoalesceEngine : public BaseMemoryEngine void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); + EventFunctionWrapper nextPreWBApplyEvent; + void processNextPreWBApplyEvent(); struct CoalesceStats : public statistics::Group { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 70f1e05f32..b91aa21a53 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -80,9 +81,16 @@ MPU::recvWorkload(GraphWorkload* workload) } void -MPU::recvVertexPush(Addr addr, WorkListItem wl) +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) { - pushEngine->recvVertexPush(addr, wl); + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvPrevPullCorrection() +{ + DPRINTF(MPU, "%s: Fuck!\n", __func__); } void diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8f6101c325..8f3b29f603 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,9 +75,9 @@ class MPU : public SimObject void recvVertexPull() { return coalesceEngine->recvVertexPull(); } bool running() { return pushEngine->running(); } void start() { return pushEngine->start(); } - void recvVertexPush(Addr addr, WorkListItem wl); - void recvVertexPush2(Addr addr, uint32_t delta, + void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t 
degree); + void recvPrevPullCorrection(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c54f19307f..c76567696e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -184,18 +184,18 @@ PushEngine::processNextVertexPullEvent() } void -PushEngine::recvVertexPush(Addr addr, WorkListItem wl) +PushEngine::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) { - assert(wl.degree > 0); + assert(degree > 0); assert((edgePointerQueueSize == 0) || ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); - Addr start_addr = wl.edgeIndex * sizeof(Edge); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); - // uint32_t value = calculateValue(wl); - EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; @@ -207,6 +207,7 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) (!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); } + } void @@ -229,7 +230,7 @@ PushEngine::processNextMemoryReadEvent() "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; + PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1112176897..848c93e313 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh 
@@ -66,21 +66,24 @@ class PushEngine : public BaseMemoryEngine class EdgeReadInfoGen { private: + Addr _src; + uint32_t _delta; + Addr _start; Addr _end; size_t _step; size_t _atom; - Addr _src; - uint32_t _value; - public: - EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _src(src), _value(value) + EdgeReadInfoGen(Addr src, uint32_t delta, Addr start, + Addr end, size_t step, size_t atom): + _src(src), _delta(delta), _start(start), + _end(end), _step(step), _atom(atom) {} + Addr src() { return _src; } + uint32_t delta() { return _delta; } + std::tuple nextReadPacketInfo() { panic_if(done(), "Should not call nextPacketInfo when done.\n"); @@ -105,9 +108,6 @@ class PushEngine : public BaseMemoryEngine } bool done() { return (_start >= _end); } - - Addr src() { return _src; } - uint32_t value() { return _value; } }; struct PushInfo { Addr src; @@ -197,8 +197,7 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } - void recvVertexPush(Addr addr, WorkListItem wl); - void recvVertexPush2(Addr addr, uint32_t delta, + void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); void recvReqRetry(); From 932aec66eb6997d2be580eb711f299ee41d1559b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 17 Oct 2022 08:40:47 -0700 Subject: [PATCH 197/247] Implementing correction function for PushEngine. 
--- src/accl/graph/sega/coalesce_engine.cc | 5 +++-- src/accl/graph/sega/mpu.cc | 2 +- src/accl/graph/sega/push_engine.cc | 9 +++++++++ src/accl/graph/sega/push_engine.hh | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0c223a8a5b..441457f2e8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -501,9 +501,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) owner->recvVertexPush(vertex_addr, delta, items[i].edgeIndex, items[i].degree); } else { + // TODO: Add a stat to count this. owner->recvPrevPullCorrection(); } - stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } @@ -548,7 +548,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextMemoryEvent, nextCycle()); } } else { - DPRINTF(CoalesceEngine, "%s: Fuck 2.\n", __func__); + // TODO: Add a stat to count this. + DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); } for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index b91aa21a53..b30060238d 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -90,7 +90,7 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, void MPU::recvPrevPullCorrection() { - DPRINTF(MPU, "%s: Fuck!\n", __func__); + pushEngine->recvPrevPullCorrection(); } void diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c76567696e..07f37a28dc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -207,7 +207,16 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, (!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); } +} +void +PushEngine::recvPrevPullCorrection() +{ + assert(numPendingPulls > 0); + numPendingPulls--; + if (workLeft() && vertexSpace() && 
(!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } } void diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 848c93e313..2e1de25390 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -199,6 +199,7 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); + void recvPrevPullCorrection(); void recvReqRetry(); From 60ea8db3c1de4536d384c9b03e782db5739bf7b9 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 19 Oct 2022 08:03:16 -0700 Subject: [PATCH 198/247] Adding initialization to graphWorkloads --- configs/accl/sega-hbm.py | 4 +- src/accl/graph/base/data_structs.hh | 2 + src/accl/graph/base/graph_workload.cc | 72 ++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 44 +++++++++++-- src/accl/graph/sega/centeral_controller.cc | 22 ++----- src/accl/graph/sega/centeral_controller.hh | 7 ++- src/accl/graph/sega/coalesce_engine.cc | 3 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 8 files changed, 128 insertions(+), 28 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 50fd5f3069..9078c185f3 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -57,7 +57,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): num_mshr_entry=64, num_tgts_per_mshr=64, max_resp_per_cycle=8, - post_apply_wb_queue_size=64 + post_push_wb_queue_size=64 ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -136,7 +136,7 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): self.ctrl.createInitialBFSUpdate(init_addr, init_value) - + def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 
2d81375b63..70babf5960 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -37,6 +37,8 @@ #include #include +#define MAX_BITVECTOR_SIZE (1 << 28) + namespace gem5 { diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 6a8e000515..542f2e0221 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -31,6 +31,37 @@ namespace gem5 { +BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): + GraphWorkload(), initValue(init_value), atomSize(atom_size) +{ + initAddrBase = roundDown(init_addr, atomSize); + initIndex = (init_addr - initAddrBase) / atomSize; + numElementsPerLine = atomSize / sizeof(WorkListItem); +} + + +void +BFSWorkload::init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) +{ + if (pkt->getAddr() == initAddrBase) { + WorkListItem items[numElementsPerLine]; + + pkt->writeDataToBlock((uint8_t*) items, atomSize); + + items[initIndex].tempProp = initValue; + items[initIndex].prop = initValue; + needsPush[bit_index_base + initIndex] = 1; + activeBits.push_back(bit_index_base + initIndex); + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, atomSize); + } + +} + uint32_t BFSWorkload::reduce(uint32_t update, uint32_t value) { @@ -68,4 +99,45 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + return update+value; +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + return (alpha*value*weight); +} + +bool +PRWorkload::applyCondition(WorkListItem wl) +{ + return wl.tempProp != wl.prop; +} + +bool +PRWorkload::preWBApply(WorkListItem& wl) +{ + if (applyCondition(wl)) { + if (wl.degree > 0) { + return true; + } + } + return false; +} + +std::tuple +PRWorkload::prePushApply(WorkListItem& wl) +{ + uint32_t delta = 
abs(wl.prop - wl.tempProp)/wl.degree; + if (delta > threshold) { + return std::make_tuple(delta, true, true); + } + uint32_t value = wl.tempProp; + return std::make_tuple(value, false, false); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index c4db5c9e2f..cc0767305a 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -29,9 +29,13 @@ #ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ #define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#include +#include #include #include "accl/graph/base/data_structs.hh" +#include "base/intmath.hh" +#include "mem/packet.hh" namespace gem5 @@ -42,6 +46,10 @@ class GraphWorkload public: GraphWorkload() {} ~GraphWorkload() {} + + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; @@ -52,16 +60,42 @@ class GraphWorkload class BFSWorkload : public GraphWorkload { private: - uint64_t initAddr; + uint64_t initAddrBase; + int initIndex; uint32_t initValue; + int numElementsPerLine; + int atomSize; public: - BFSWorkload(uint64_t init_addr, uint32_t init_value): - GraphWorkload(), - initAddr(init_addr), initValue(init_value) - {} + BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size); ~BFSWorkload() {} + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual bool applyCondition(WorkListItem wl); + virtual bool preWBApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); +}; + + +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + public: + 
PRWorkload(float alpha, float threshold): + GraphWorkload(), alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fd282834e9..dbd1705e8a 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -51,12 +51,13 @@ CenteralController::CenteralController(const Params& params): } void -CenteralController::initState() +CenteralController::startup() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); mpu->recvWorkload(workload); } + const auto& file = params().image_file; if (file == "") return; @@ -79,22 +80,11 @@ CenteralController::initState() }, system->cacheLineSize()); panic_if(!image.write(proxy), "%s: Unable to write image."); -} -void -CenteralController::startup() -{ - while(!initialUpdates.empty()) { - PacketPtr front = initialUpdates.front(); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - for (auto range: range_list) { - if (range.contains(front->getAddr())) { - mpu->handleIncomingUpdate(front); - } - } + for (auto mpu: mpuVector) { + if (!mpu->running() && (mpu->workCount ()> 0)) { + mpu->start(); } - initialUpdates.pop_front(); } } @@ -140,7 +130,7 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { - workload = new BFSWorkload(init_addr, init_value); + workload = new BFSWorkload(init_addr, init_value, system->cacheLineSize()); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 
1f1df00b4b..4c5ff28ebe 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -48,8 +48,6 @@ class CenteralController : public ClockedObject private: System* system; - GraphWorkload* workload; - Addr maxVertexAddr; std::deque initialUpdates; @@ -60,10 +58,13 @@ class CenteralController : public ClockedObject template PacketPtr createUpdatePacket(Addr addr, T value); public: + + GraphWorkload* workload; + PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - virtual void initState() override; + // virtual void initState() override; virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 441457f2e8..b91b92c0fb 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -121,7 +121,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) } } else { // TODO: Add and implement init function for GraphWorkload. - // graphWorkload->init(pkt); + int bit_index_base = getBitIndexBase(pkt->getAddr()); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits); memPort.sendFunctional(pkt); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c0091a494d..926caf46db 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -38,7 +38,7 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" -#define MAX_BITVECTOR_SIZE (1 << 28) + namespace gem5 { From 9b91fb71245587cfbd95e11bab0d767e571d69f3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 22 Oct 2022 12:36:32 -0700 Subject: [PATCH 199/247] Fixing algo start issue. 
--- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index dbd1705e8a..61ad7c10b4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,7 +82,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { - if (!mpu->running() && (mpu->workCount ()> 0)) { + if (!mpu->running() && (mpu->workCount()> 0)) { mpu->start(); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b91b92c0fb..72ceba6f89 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1079,7 +1079,7 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple( BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory - } else if (cacheBlocks[block_index].addr != addr) { + } else if ((!cacheBlocks[block_index].valid) || (cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( BitStatus::IN_MEMORY, addr, index_offset); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 926caf46db..8c187f8fb8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -209,7 +209,7 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - int workCount() { return _workCount; } + int workCount() { return needsPush.count(); } void recvVertexPull(); bool done(); From d4644cea189cf0deb4b7714018b2a14153c10d7b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 22 Oct 2022 13:49:41 -0700 Subject: [PATCH 200/247] Fixing block addr initialization. 
--- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 72ceba6f89..5b5374873c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -267,7 +267,7 @@ CoalesceEngine::recvWLRead(Addr addr) // is cold and addr or aligned_addr is 0. It fails because cache block // addr field is initialized to 0. Unfortunately Addr type is unsigned. // So you can not initialized addr to -1. - // assert(cacheBlocks[block_index].addr != aligned_addr); + assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8c187f8fb8..e710553be1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -73,7 +73,7 @@ class CoalesceEngine : public BaseMemoryEngine // Tick lastWLWriteTick; Block() {} Block(int num_elements): - addr(0), + addr(-1), busyMask(0), valid(false), needsApply(false), From e2f68af811ad9a16c5d84aa678d1baf2208f9fe1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 23 Oct 2022 21:43:33 -0700 Subject: [PATCH 201/247] Adding PR. 
--- src/accl/graph/base/graph_workload.cc | 48 ++++++++++++++++++---- src/accl/graph/base/graph_workload.hh | 15 ++++--- src/accl/graph/sega/CenteralController.py | 3 +- src/accl/graph/sega/centeral_controller.cc | 32 +-------------- src/accl/graph/sega/centeral_controller.hh | 8 +--- src/accl/graph/sega/coalesce_engine.cc | 27 ++---------- src/accl/graph/sega/coalesce_engine.hh | 3 -- 7 files changed, 57 insertions(+), 79 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 542f2e0221..cbaef86a76 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -36,13 +36,13 @@ BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size) { initAddrBase = roundDown(init_addr, atomSize); initIndex = (init_addr - initAddrBase) / atomSize; - numElementsPerLine = atomSize / sizeof(WorkListItem); + numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); } void BFSWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits) { if (pkt->getAddr() == initAddrBase) { @@ -99,23 +99,53 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } +PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): + GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) +{ + numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); +} + +void +PRWorkload::init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) +{ + WorkListItem items[numElementsPerLine]; + + pkt->writeDataToBlock((uint8_t*) items, atomSize); + for (int i = 0; i < numElementsPerLine; i++) { + items[i].tempProp = readFromFloat(0); + items[i].prop = readFromFloat(1 - alpha); + needsPush[bit_index_base + i] = 1; + activeBits.push_back(bit_index_base + i); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, 
atomSize); +} uint32_t PRWorkload::reduce(uint32_t update, uint32_t value) { - return update+value; + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); } uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { - return (alpha*value*weight); + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + return readFromFloat(alpha * value_float * weight_float); } bool PRWorkload::applyCondition(WorkListItem wl) { - return wl.tempProp != wl.prop; + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return temp_float != prop_float; } bool @@ -132,12 +162,14 @@ PRWorkload::preWBApply(WorkListItem& wl) std::tuple PRWorkload::prePushApply(WorkListItem& wl) { - uint32_t delta = abs(wl.prop - wl.tempProp)/wl.degree; + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = abs((temp_float - prop_float) / wl.degree); if (delta > threshold) { + wl.prop = wl.tempProp; return std::make_tuple(delta, true, true); } - uint32_t value = wl.tempProp; - return std::make_tuple(value, false, false); + return std::make_tuple(0, false, false); } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index cc0767305a..831da97e71 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -48,7 +48,7 @@ class GraphWorkload ~GraphWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; @@ -65,13 +65,14 @@ class BFSWorkload : public GraphWorkload uint32_t initValue; int numElementsPerLine; int atomSize; + public: BFSWorkload(uint64_t init_addr, uint32_t init_value, int 
atom_size); ~BFSWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); @@ -86,15 +87,17 @@ class PRWorkload : public GraphWorkload private: float alpha; float threshold; + + int numElementsPerLine; + int atomSize; + public: - PRWorkload(float alpha, float threshold): - GraphWorkload(), alpha(alpha), threshold(threshold) - {} + PRWorkload(float alpha, float threshold, int atom_size); ~PRWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 17badf9ec4..09a997696d 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -42,8 +42,7 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") cxx_exports = [ - PyBindMethod("createInitialBFSUpdate"), PyBindMethod("createBFSWorkload"), - PyBindMethod("createInitialPRUpdate"), + PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 61ad7c10b4..57198450d4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -103,30 +103,6 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) return pkt; } -template PacketPtr -CenteralController::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared(addr, sizeof(T), addr, value); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - 
req->setPC(((Addr) value) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - - pkt->setLE(value); - - return pkt; -} - -void -CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) -{ - PacketPtr update = createUpdatePacket(init_addr, init_value); - initialUpdates.push_back(update); -} - void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { @@ -134,13 +110,9 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) } void -CenteralController::createInitialPRUpdate() +CenteralController::createPRWorkload(float alpha, float threshold) { - for (auto mpu: mpuVector) { - if (!mpu->running() && (mpu->workCount() > 0)) { - mpu->start(); - } - } + workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 4c5ff28ebe..9ddb1b35f0 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -49,13 +49,11 @@ class CenteralController : public ClockedObject System* system; Addr maxVertexAddr; - std::deque initialUpdates; std::vector mpuVector; std::unordered_map addrRangeListMap; PacketPtr createReadPacket(Addr addr, unsigned int size); - template PacketPtr createUpdatePacket(Addr addr, T value); public: @@ -63,13 +61,11 @@ class CenteralController : public ClockedObject PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - - // virtual void initState() override; virtual void startup() override; - void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createInitialPRUpdate(); + void createPRWorkload(float alpha, float threshold); + void recvDoneSignal(); void printAnswerToHostSimout(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 
5b5374873c..e71cc1195f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,8 +48,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), - postPushWBQueueSize(params.post_push_wb_queue_size), + numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), maxPotentialPostPushWB(0), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -76,25 +75,6 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } -// void -// CoalesceEngine::algoInit(PacketPtr pkt) -// { -// WorkListItem items[numElementsPerLine]; - -// if(workload == "PR") { -// //TODO: Add Alpha -// pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); -// int bit_index_base = getBitIndexBase(pkt->getAddr()); -// for (int i = 0; i < numElementsPerLine; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - 0.2); -// needsPush[bit_index_base + i] = 1; -// activeBits.push_back(bit_index_base + i); -// } -// pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); -// } -// } - void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -491,7 +471,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (vertex_send_mask != 0) { assert(needsPush[it + i] == 1); needsPush[it + i] = 0; - _workCount--; uint32_t delta; bool do_push, do_wb_v; @@ -550,6 +529,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } } else { // TODO: Add a stat to count this. + // FIXME: This is not a totally wasteful read. e.g. all reads + // for pull in BFS are like this. 
DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); } @@ -788,7 +769,6 @@ CoalesceEngine::processNextPreWBApplyEvent() if (do_push) { int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); if (needsPush[bit_index_base + index] == 0) { - _workCount++; needsPush[bit_index_base + index] = 1; activeBits.push_back(bit_index_base + index); if (!owner->running()) { @@ -1125,7 +1105,6 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) int slice_base_index = getBitIndexBase(addr); needsPush[slice_base_index + wl_offset] = 0; - _workCount--; uint32_t delta; bool do_push, do_wb; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e710553be1..c8fec38e5b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -116,9 +116,6 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; - bool applyBeforeWB; - bool applyBeforePush; - int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; From bb31571e3cab67431ddbd146174997e87716b00b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 23 Oct 2022 22:14:05 -0700 Subject: [PATCH 202/247] Prepping for PR. 
--- configs/accl/sega-hbm.py | 10 +++++----- src/accl/graph/sega/coalesce_engine.cc | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 9078c185f3..1c9276f0a0 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -134,12 +134,12 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) + def print_answer(self): self.ctrl.printAnswerToHostSimout() @@ -169,8 +169,8 @@ def get_inputs(): m5.instantiate() - system.create_initial_bfs_update(init_addr, init_value) - system.create_bfs_workload(init_addr, init_value) + # system.create_bfs_workload(init_addr, init_value) + system.create_pr_workload(0.2, 0.0000001) exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e71cc1195f..2d5445093a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -577,6 +577,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } + + // TODO: Probably check for done here too. delete pkt; return true; } From 9c1f57e6d82ebbf5d3dd7b23e8a5cb0912fb04b4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 25 Oct 2022 13:52:56 -0700 Subject: [PATCH 203/247] Adding print function to GraphWorkload class. 
--- src/accl/graph/base/data_structs.hh | 21 ----------- src/accl/graph/base/graph_workload.cc | 44 ++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 4 +- src/accl/graph/sega/centeral_controller.cc | 4 +- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 70babf5960..d9028e2f10 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -34,7 +34,6 @@ #include #include -#include #include #define MAX_BITVECTOR_SIZE (1 << 28) @@ -181,26 +180,6 @@ class UniqueFIFO } }; -template -float -writeToFloat(T value) -{ - assert(sizeof(T) == sizeof(float)); - float float_form; - std::memcpy(&float_form, &value, sizeof(float)); - return float_form; -} - -template -T -readFromFloat(float value) -{ - assert(sizeof(T) == sizeof(float)); - T float_bits; - std::memcpy(&float_bits, &value, sizeof(float)); - return float_bits; -} - } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index cbaef86a76..ead32c0eb8 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -28,9 +28,34 @@ #include "accl/graph/base/graph_workload.hh" +#include + +#include "base/cprintf.hh" +#include "base/intmath.hh" + namespace gem5 { +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): GraphWorkload(), initValue(init_value), atomSize(atom_size) { @@ -99,6 +124,15 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } +std::string 
+BFSWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) { @@ -172,4 +206,14 @@ PRWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(0, false, false); } +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + return csprintf( + "WorkListItem{tempProp: %f, prop: %u, degree: %u, edgeIndex: %u}", + temp_float, temp_float, wl.degree, wl.edgeIndex + ); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 831da97e71..c391a80c23 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -34,7 +34,6 @@ #include #include "accl/graph/base/data_structs.hh" -#include "base/intmath.hh" #include "mem/packet.hh" @@ -55,6 +54,7 @@ class GraphWorkload virtual bool applyCondition(WorkListItem wl) = 0; virtual bool preWBApply(WorkListItem& wl) = 0; virtual std::tuple prePushApply(WorkListItem& wl) = 0; + virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; class BFSWorkload : public GraphWorkload @@ -79,6 +79,7 @@ class BFSWorkload : public GraphWorkload virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -104,6 +105,7 @@ class PRWorkload : public GraphWorkload virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::string printWorkListItem(const WorkListItem wl); }; } diff --git a/src/accl/graph/sega/centeral_controller.cc 
b/src/accl/graph/sega/centeral_controller.cc index 57198450d4..fc2262e111 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -144,8 +144,8 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - std::string print = csprintf("WorkListItem[%lu][%d]: %s.", - addr, i, items[i].to_string()); + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, + workload->printWorkListItem(items[i])); std::cout << print << std::endl; } From 95c676bd0ec2ddacf512945b4de454bd91f52f6c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 25 Oct 2022 16:48:11 -0700 Subject: [PATCH 204/247] Updating PR --- src/accl/graph/base/graph_workload.cc | 36 +++++++++-------- src/accl/graph/sega/coalesce_engine.cc | 53 ++++++++++++++++---------- src/accl/graph/sega/wl_engine.cc | 10 ++--- 3 files changed, 58 insertions(+), 41 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index ead32c0eb8..9f7e5fc4c5 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -77,8 +77,10 @@ BFSWorkload::init(PacketPtr pkt, int bit_index_base, items[initIndex].tempProp = initValue; items[initIndex].prop = initValue; - needsPush[bit_index_base + initIndex] = 1; - activeBits.push_back(bit_index_base + initIndex); + if (items[initIndex].degree > 0) { + needsPush[bit_index_base + initIndex] = 1; + activeBits.push_back(bit_index_base + initIndex); + } pkt->deleteData(); pkt->allocate(); @@ -150,8 +152,10 @@ PRWorkload::init(PacketPtr pkt, int bit_index_base, for (int i = 0; i < numElementsPerLine; i++) { items[i].tempProp = readFromFloat(0); items[i].prop = readFromFloat(1 - alpha); - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); + if (items[i].degree > 0) { + needsPush[bit_index_base + i] = 1; + 
activeBits.push_back(bit_index_base + i); + } } pkt->deleteData(); pkt->allocate(); @@ -170,7 +174,7 @@ uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(weight); + float weight_float = writeToFloat(1); return readFromFloat(alpha * value_float * weight_float); } @@ -179,27 +183,27 @@ PRWorkload::applyCondition(WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); - return temp_float != prop_float; + float dist = std::abs(temp_float - prop_float); + return dist >= threshold; } bool PRWorkload::preWBApply(WorkListItem& wl) { - if (applyCondition(wl)) { - if (wl.degree > 0) { - return true; - } + if (applyCondition(wl) && (wl.degree > 0)) { + return true; } return false; } std::tuple PRWorkload::prePushApply(WorkListItem& wl) -{ - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float delta = abs((temp_float - prop_float) / wl.degree); - if (delta > threshold) { +{ + if (applyCondition(wl)) { + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + std::cout << "PRWorkload: delta: " << delta << std::endl; wl.prop = wl.tempProp; return std::make_tuple(delta, true, true); } @@ -211,7 +215,7 @@ PRWorkload::printWorkListItem(const WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); return csprintf( - "WorkListItem{tempProp: %f, prop: %u, degree: %u, edgeIndex: %u}", + "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", temp_float, temp_float, wl.degree, wl.edgeIndex ); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 2d5445093a..0d1eecf43f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -157,7 +157,7 @@ CoalesceEngine::recvWLRead(Addr addr) "%lu, and wl_offset: 
%d.\n", __func__, addr, block_index, aligned_addr, wl_offset); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); + block_index, cacheBlocks[block_index].to_string()); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { @@ -176,15 +176,17 @@ CoalesceEngine::recvWLRead(Addr addr) addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // If they are scheduled for apply and WB those schedules should be @@ -476,6 +478,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool do_push, do_wb_v; std::tie(delta, do_push, do_wb_v) = graphWorkload->prePushApply(items[i]); + std::cout << "CoalesceEngine: delta: " << delta << std::endl; do_wb |= do_wb_v; if (do_push) { owner->recvVertexPush(vertex_addr, delta, @@ -508,8 +511,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, - cacheBlocks[block_index].items[i].to_string()); + __func__, block_index, i, graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[i])); } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].needsWB |= do_wb; @@ -550,12 +553,14 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); @@ -603,7 +608,9 @@ CoalesceEngine::processNextResponseEvent() num_responses_sent++; DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); responseQueue.pop_front(); DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " @@ -640,12 +647,13 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " "wl: %s. This request maps to cacheBlocks[%d], " "aligned_addr: %lu, and wl_offset: %d.\n", - __func__, addr, wl.to_string(), + __func__, addr, graphWorkload->printWorkListItem(wl), block_index, aligned_addr, wl_offset); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, wl.to_string(), addr); + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); // Desing does not allow for write misses for now. 
assert(cacheBlocks[block_index].addr == aligned_addr); // cache state asserts @@ -666,13 +674,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { cacheBlocks[block_index].needsApply |= true; + cacheBlocks[block_index].needsWB |= true; } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, - cacheBlocks[block_index].items[wl_offset].to_string()); + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -899,12 +909,14 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); @@ -1061,7 +1073,7 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple( BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory - } else if ((!cacheBlocks[block_index].valid) || (cacheBlocks[block_index].addr != addr)) { + } else if ((cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( BitStatus::IN_MEMORY, addr, index_offset); @@ -1112,6 +1124,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) bool do_push, do_wb; std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( cacheBlocks[block_index].items[wl_offset]); + std::cout << "CoalesceEngine: delta: " << delta << std::endl; cacheBlocks[block_index].needsWB |= do_wb; if (do_push) { owner->recvVertexPush(vertex_addr, delta, diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 85fe9be2ca..a698f2cc0a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -263,10 +263,10 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) workListFile[addr] = wl; DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " "workListFile. workListFile.size = %d.\n", __func__, addr, - wl.to_string(), workListFile.size()); + graphWorkload->printWorkListItem(wl), workListFile.size()); DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " "workListFile. workListFile.size = %d.\n", __func__, addr, - wl.to_string(), workListFile.size()); + graphWorkload->printWorkListItem(wl), workListFile.size()); stats.vertexReadLatency.sample( ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); @@ -287,13 +287,13 @@ WLEngine::processNextReduceEvent() uint32_t update_value = registerFile[addr]; DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" ". 
registerFile[%lu] = %u, workListFile[%lu] = %s.\n", - __func__, addr, registerFile[addr], - addr, workListFile[addr].to_string()); + __func__, addr, registerFile[addr], addr, + graphWorkload->printWorkListItem(workListFile[addr])); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = graphWorkload->reduce(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", - __func__, addr, workListFile[addr].to_string()); + __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); stats.numReduce++; owner->recvWLWrite(addr, workListFile[addr]); From 166c3ac21df0a8175334dc8c426309e603d81b03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Oct 2022 07:11:05 -0700 Subject: [PATCH 205/247] Updating configs for pr and bfs. Fixing bugs for pr. --- configs/accl/bfs.py | 78 +++++++++++ configs/accl/pr.py | 78 +++++++++++ configs/accl/real-graph-gen.py | 41 ++++-- configs/accl/sega-hbm.py | 178 ------------------------- configs/accl/sega.py | 137 +++++++++---------- configs/accl/synth-graph-gen.py | 88 ++++++++---- src/accl/graph/base/graph_workload.cc | 10 +- src/accl/graph/sega/coalesce_engine.cc | 24 ++-- 8 files changed, 332 insertions(+), 302 deletions(-) create mode 100644 configs/accl/bfs.py create mode 100644 configs/accl/pr.py delete mode 100644 configs/accl/sega-hbm.py diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py new file mode 100644 index 0000000000..d02faa96ca --- /dev/null +++ b/configs/accl/bfs.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=float) + argparser.add_argument("init_value", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + ) + + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_bfs_workload(init_addr, init_value) + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py new file mode 100644 index 0000000000..59e8b924c6 --- /dev/null +++ b/configs/accl/pr.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + ) + + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, alpha, threshold, verify = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_pr_workload(alpha, threshold) + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py index db44c63a9a..b943a925c1 100644 --- a/configs/accl/real-graph-gen.py +++ b/configs/accl/real-graph-gen.py @@ -28,14 +28,20 @@ import argparse import subprocess + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("path", type=str, help="Path to the graph file.") - argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + argparser.add_argument( + "num_gpts", + type=int, + help="Number gpts to create synth graph binaries for.", + ) args = argparser.parse_args() return args.path, args.num_gpts + if __name__ == "__main__": graph_path, num_gpts = get_inputs() @@ -59,16 +65,29 @@ def get_inputs(): print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if 
not all([binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + if not all( + [ + binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") + for binary in expected_bins + ] + ): + print( + f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" + ) for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): os.remove(delete.path) print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") - subprocess.run([f"{graph_reader}" , - f"{graph_path}", - "false", - f"{num_gpts}", - "32", - f"{graph_dir}/binaries/gpts_{num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_dir}/binaries/gpts_{num_gpts}") + subprocess.run( + [ + f"{graph_reader}", + f"{graph_path}", + "false", + f"{num_gpts}", + "32", + f"{graph_dir}/binaries/gpts_{num_gpts}", + ] + ) + print( + f"Created the graph binaries in " + f"{graph_dir}/binaries/gpts_{num_gpts}" + ) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py deleted file mode 100644 index 1c9276f0a0..0000000000 --- a/configs/accl/sega-hbm.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret, intlv_low_bit + intlv_bits - 1 - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8, - post_push_wb_queue_size=64 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=512, - update_queue_size=32 - ) - - self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), - dram_2=HBM_2000_4H_1x64()) - - self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - 
range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.wl_engine.in_ports - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - def setReqPort(self, port): - self.push_engine.out_ports = port - - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image - -class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), - 2*num_mpus, - 32 - ) - - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) - gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) - gpt.set_vertex_pch_bit(pch_bit) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) - - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, 
threshold) - - def print_answer(self): - self.ctrl.printAnswerToHostSimout() - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - argparser.add_argument("--verify", type=bool, help="Print final answer") - - args = argparser.parse_args() - - verify = False - if not args.verify is None: - verify = args.verify - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value, verify - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - # system.create_bfs_workload(init_addr, init_value) - system.create_pr_workload(0.2, 0.0000001) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") - if verify: - system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c50c525297..42c07e2e94 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -24,100 +24,111 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import m5 -import argparse - from math import log from m5.objects import * + def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( start=plain_range.start, size=plain_range.size(), intlvHighBit=intlv_low_bit + intlv_bits - 1, xorHighBit=0, intlvBits=intlv_bits, - intlvMatch=i)) - return ret + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) + self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8, + post_push_wb_queue_size=64, + ) self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=32 - ) - - self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) - - self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) + Xpush_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=512, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(), dram_2=HBM_2000_4H_1x64() + ) + + self.edge_mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), in_addr_map=False + ) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = 
self.edge_mem_ctrl.port self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) def getRespPort(self): return self.wl_engine.in_ports + def setRespPort(self, port): self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports + def setReqPort(self, port): self.push_engine.out_ports = port - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.dram.range = vertex_range + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image + class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_mpus, 32 + ) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) - gpt.set_vertex_range(vertex_ranges[i]) + gpt = GPT("2GiB", cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_mpus]] + ) + gpt.set_vertex_pch_bit(pch_bit) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) # Creating the interconnect among mpus @@ -128,31 +139,11 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def 
create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) - m5.instantiate() + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/synth-graph-gen.py b/configs/accl/synth-graph-gen.py index 16985b3537..15e4a6eff2 100644 --- a/configs/accl/synth-graph-gen.py +++ b/configs/accl/synth-graph-gen.py @@ -28,15 +28,27 @@ import argparse import subprocess + def get_inputs(): argparser = argparse.ArgumentParser() - argparser.add_argument("scale", type=int, help="The scale of the synth graph to generate.") - argparser.add_argument("deg", type=int, help="The average degree of the synth graph to generate.") - argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + argparser.add_argument( + "scale", type=int, help="The scale of the synth graph to generate." 
+ ) + argparser.add_argument( + "deg", + type=int, + help="The average degree of the synth graph to generate.", + ) + argparser.add_argument( + "num_gpts", + type=int, + help="Number gpts to create synth graph binaries for.", + ) args = argparser.parse_args() return args.scale, args.deg, args.num_gpts + if __name__ == "__main__": scale, deg, num_gpts = get_inputs() @@ -62,18 +74,27 @@ def get_inputs(): for delete in os.scandir(graph_path): os.remove(delete.path) print(f"Deleted everything in {graph_path}") - subprocess.run([f"{graph_gen}", - f"{scale}", - f"{deg}", - f"{graph_path}/graph_unordered.txt"]) - print(f"Generated a graph with scale " - f"{scale} and deg {deg}") - subprocess.run(["python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt"]) - print(f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt") + subprocess.run( + [ + f"{graph_gen}", + f"{scale}", + f"{deg}", + f"{graph_path}/graph_unordered.txt", + ] + ) + print(f"Generated a graph with scale " f"{scale} and deg {deg}") + subprocess.run( + [ + "python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt", + ] + ) + print( + f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt" + ) subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) print(f"Deleted {graph_path}/graph_unordered.txt") @@ -88,16 +109,31 @@ def get_inputs(): print(f"Created {graph_path}/binaries/gpts_{num_gpts}") expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + if not all( + [ + binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") + for binary in expected_bins + ] + ): + print( + f"Not all expected binaries found in 
{graph_path}/binaries/gpts_{num_gpts}" + ) for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): os.remove(delete.path) - print(f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}") - subprocess.run([f"{graph_reader}" , - f"{graph_path}/graph.txt", - "false", - f"{num_gpts}", - "32", - f"{graph_path}/binaries/gpts_{num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{num_gpts}") + print( + f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}" + ) + subprocess.run( + [ + f"{graph_reader}", + f"{graph_path}/graph.txt", + "false", + f"{num_gpts}", + "32", + f"{graph_path}/binaries/gpts_{num_gpts}", + ] + ) + print( + f"Created the graph binaries in " + f"{graph_path}/binaries/gpts_{num_gpts}" + ) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 9f7e5fc4c5..e362d605c0 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -174,7 +174,9 @@ uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(1); + float weight_float = 1.0; + float delta = alpha * value_float * weight_float; + return readFromFloat(alpha * value_float * weight_float); } @@ -198,14 +200,14 @@ PRWorkload::preWBApply(WorkListItem& wl) std::tuple PRWorkload::prePushApply(WorkListItem& wl) -{ +{ if (applyCondition(wl)) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); float delta = (temp_float - prop_float) / wl.degree; - std::cout << "PRWorkload: delta: " << delta << std::endl; + uint32_t delta_uint = readFromFloat(delta); wl.prop = wl.tempProp; - return std::make_tuple(delta, true, true); + return std::make_tuple(delta_uint, true, true); } return std::make_tuple(0, false, false); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0d1eecf43f..2f6555602c 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -196,7 +196,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; // HACK: If a read happens on the same cycle as another operation such - // apply setLastChangedTick to half a cycle later so that operations + // as apply set lastChangedTick to half a cycle later so that operation // scheduled by the original operation (apply in this example) are // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" cacheBlocks[block_index].lastChangedTick = @@ -478,7 +478,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool do_push, do_wb_v; std::tie(delta, do_push, do_wb_v) = graphWorkload->prePushApply(items[i]); - std::cout << "CoalesceEngine: delta: " << delta << std::endl; do_wb |= do_wb_v; if (do_push) { owner->recvVertexPush(vertex_addr, delta, @@ -517,7 +516,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].needsWB |= do_wb; cacheBlocks[block_index].pendingData = false; - cacheBlocks[block_index].lastChangedTick = curTick(); + // HACK: In case processNextRead is called on the same tick as curTick + // and is scheduled to read to the same cacheBlocks[block_index] + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); @@ -564,7 +566,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); + // cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); it = MSHR[block_index].erase(it); @@ -608,8 +610,8 @@ CoalesceEngine::processNextResponseEvent() num_responses_sent++; DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, - graphWorkload->printWorkListItem(worklist_response), + __func__, + graphWorkload->printWorkListItem(worklist_response), addr_response); responseQueue.pop_front(); @@ -652,7 +654,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, + "with Addr: %lu.\n", __func__, graphWorkload->printWorkListItem(wl), addr); // Desing does not allow for write misses for now. assert(cacheBlocks[block_index].addr == aligned_addr); @@ -874,8 +876,11 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); // A cache block should not be touched while it's waiting for data. 
- assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - // + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); @@ -1124,7 +1129,6 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) bool do_push, do_wb; std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( cacheBlocks[block_index].items[wl_offset]); - std::cout << "CoalesceEngine: delta: " << delta << std::endl; cacheBlocks[block_index].needsWB |= do_wb; if (do_push) { owner->recvVertexPush(vertex_addr, delta, From ffbef8e2cf85c635d8814ccf1951ea145a968fb6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Oct 2022 07:46:18 -0700 Subject: [PATCH 206/247] Fixing typos. --- configs/accl/bfs.py | 8 ++++---- configs/accl/sega.py | 2 +- src/accl/graph/base/graph_workload.cc | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index d02faa96ca..fc32b96642 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -37,8 +37,8 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=float) - argparser.add_argument("init_value", type=float) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) argparser.add_argument( "--verify", dest="verify", @@ -54,8 +54,8 @@ def get_inputs(): args.num_gpts, args.cache_size, args.graph, - args.alpha, - args.threshold, + args.init_addr, + args.init_value, args.verify, ) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 42c07e2e94..0f4b133791 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -59,7 +59,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): 
post_push_wb_queue_size=64, ) self.push_engine = PushEngine( - Xpush_req_queue_size=32, + push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=512, update_queue_size=32, diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index e362d605c0..44136cb4c1 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -175,7 +175,6 @@ PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); float weight_float = 1.0; - float delta = alpha * value_float * weight_float; return readFromFloat(alpha * value_float * weight_float); } From fe146055cc230e532d878a66cd0c1577a81234f3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 27 Oct 2022 14:24:18 -0700 Subject: [PATCH 207/247] Adding sample script. --- configs/accl/pr-sample.py | 109 +++++++++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 2 +- 2 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 configs/accl/pr-sample.py diff --git a/configs/accl/pr-sample.py b/configs/accl/pr-sample.py new file mode 100644 index 0000000000..ac3616dc84 --- /dev/null +++ b/configs/accl/pr-sample.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 10us", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + args.sample, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + verify, + sample, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_pr_workload(alpha, threshold) + + if sample: + while True: + exit_event = m5.simulate(10000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + print(exit_event.getCause()) + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 2f6555602c..1dbe2a0d56 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -519,7 +519,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // HACK: In case processNextRead is called on the same tick as curTick // and is scheduled to read 
to the same cacheBlocks[block_index] cacheBlocks[block_index].lastChangedTick = - curTick() + (Tick) (clockPeriod() / 2); + curTick() - (Tick) (clockPeriod() / 2); } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); From 151a02fbe697abb0713b99c0ff72fa4f16bf63b1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 28 Oct 2022 11:02:32 -0700 Subject: [PATCH 208/247] Fixing sim performance issue. --- src/accl/graph/base/graph_workload.cc | 8 ++++++-- src/accl/graph/base/graph_workload.hh | 9 ++++++--- src/accl/graph/sega/coalesce_engine.cc | 7 +++++-- src/accl/graph/sega/coalesce_engine.hh | 18 ++++++++++++++++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 44136cb4c1..07accff44f 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -68,7 +68,8 @@ BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size) void BFSWorkload::init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) + std::deque& activeBits, + int& _workCount) { if (pkt->getAddr() == initAddrBase) { WorkListItem items[numElementsPerLine]; @@ -80,6 +81,7 @@ BFSWorkload::init(PacketPtr pkt, int bit_index_base, if (items[initIndex].degree > 0) { needsPush[bit_index_base + initIndex] = 1; activeBits.push_back(bit_index_base + initIndex); + _workCount++; } pkt->deleteData(); @@ -144,7 +146,8 @@ PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): void PRWorkload::init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) + std::deque& activeBits, + int& _workCount) { WorkListItem items[numElementsPerLine]; @@ -155,6 +158,7 @@ PRWorkload::init(PacketPtr pkt, int bit_index_base, if (items[i].degree > 0) { needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); + _workCount++; } } 
pkt->deleteData(); diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index c391a80c23..6bbc4935c2 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -48,7 +48,8 @@ class GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) = 0; + std::deque& activeBits, + int& _workCount) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; @@ -73,7 +74,8 @@ class BFSWorkload : public GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits); + std::deque& activeBits, + int& _workCount); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); @@ -99,7 +101,8 @@ class PRWorkload : public GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits); + std::deque& activeBits, + int& _workCount); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1dbe2a0d56..38f05f937a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -47,7 +47,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), 
maxPotentialPostPushWB(0), nextMemoryEvent([this] { @@ -102,7 +102,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) } else { // TODO: Add and implement init function for GraphWorkload. int bit_index_base = getBitIndexBase(pkt->getAddr()); - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); memPort.sendFunctional(pkt); } } @@ -473,6 +473,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (vertex_send_mask != 0) { assert(needsPush[it + i] == 1); needsPush[it + i] = 0; + _workCount--; uint32_t delta; bool do_push, do_wb_v; @@ -784,6 +785,7 @@ CoalesceEngine::processNextPreWBApplyEvent() int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); if (needsPush[bit_index_base + index] == 0) { needsPush[bit_index_base + index] = 1; + _workCount++; activeBits.push_back(bit_index_base + index); if (!owner->running()) { owner->start(); @@ -1124,6 +1126,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) int slice_base_index = getBitIndexBase(addr); needsPush[slice_base_index + wl_offset] = 0; + _workCount--; uint32_t delta; bool do_push, do_wb; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c8fec38e5b..64c5c4af46 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -52,6 +52,17 @@ enum BitStatus NUM_STATUS }; +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_PRE_WB_APPLY, + PENDING_WB, + NUM_CACHE_STATE +}; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -69,6 +80,7 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingApply; bool pendingWB; Tick lastChangedTick; + CacheState state; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -81,7 +93,8 @@ class CoalesceEngine : public BaseMemoryEngine pendingData(false), pendingApply(false), pendingWB(false), - lastChangedTick(0) 
+ lastChangedTick(0), + state(CacheState::INVALID) { items = new WorkListItem [num_elements]; } @@ -116,6 +129,7 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; + int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; @@ -206,7 +220,7 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - int workCount() { return needsPush.count(); } + int workCount() { return _workCount; } void recvVertexPull(); bool done(); From 82d076c4bc2efca79614cb40f08ec080bd8ac7ac Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 31 Oct 2022 09:53:00 -0700 Subject: [PATCH 209/247] Fixing write miss issue. --- src/accl/graph/sega/coalesce_engine.cc | 92 ++++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 30 ++++++++- 2 files changed, 76 insertions(+), 46 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 38f05f937a..7a064c1c2f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -495,6 +495,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) maxPotentialPostPushWB--; } + bool cache_wb = false; if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " "fill cacheBlocks[%d].\n", __func__, block_index); @@ -521,6 +522,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // and is scheduled to read to the same cacheBlocks[block_index] cacheBlocks[block_index].lastChangedTick = curTick() - (Tick) (clockPeriod() / 2); + cache_wb = true; } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); @@ -537,42 +539,44 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Add a stat to count this. // FIXME: This is not a totally wasteful read. e.g. all reads // for pull in BFS are like this. 
- DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); + DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); } - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; + if (cache_wb) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. 
+ cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } } } @@ -1045,7 +1049,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) } } -std::tuple +std::tuple CoalesceEngine::getOptimalPullAddr() { int visited_bits = 0; @@ -1066,7 +1070,7 @@ CoalesceEngine::getOptimalPullAddr() assert(vertex_send_mask == 0); activeBits.pop_front(); return std::make_tuple( - BitStatus::PENDING_READ, addr, index_offset); + WorkLocation::PENDING_READ, addr, index_offset); } else { // Only if it is in cache and it is in idle state. if ((cacheBlocks[block_index].addr == addr) && @@ -1078,12 +1082,12 @@ CoalesceEngine::getOptimalPullAddr() assert(!cacheBlocks[block_index].pendingData); activeBits.pop_front(); return std::make_tuple( - BitStatus::IN_CACHE, block_index, index_offset); + WorkLocation::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory } else if ((cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( - BitStatus::IN_MEMORY, addr, index_offset); + WorkLocation::IN_MEMORY, addr, index_offset); } } activeBits.pop_front(); @@ -1091,20 +1095,20 @@ CoalesceEngine::getOptimalPullAddr() visited_bits++; } - return std::make_tuple(BitStatus::GARBAGE, 0, 0); + return std::make_tuple(WorkLocation::GARBAGE, 0, 0); } void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - BitStatus bit_status; + WorkLocation bit_status; Addr location; int offset; std::tie(bit_status, location, offset) = getOptimalPullAddr(); - if (bit_status != BitStatus::GARBAGE) { - if (bit_status == BitStatus::PENDING_READ) { + if (bit_status != WorkLocation::GARBAGE) { + if (bit_status == WorkLocation::PENDING_READ) { // renaming the outputs to thier local names. 
Addr addr = location; int index_offset = offset; @@ -1116,7 +1120,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; numPullsReceived--; } - if (bit_status == BitStatus::IN_CACHE) { + if (bit_status == WorkLocation::IN_CACHE) { // renaming the outputs to their local names. int block_index = (int) location; int wl_offset = offset; @@ -1145,7 +1149,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) stats.lastVertexPushTime = curTick() - stats.lastResetTick; numPullsReceived--; } - if (bit_status == BitStatus::IN_MEMORY) { + if (bit_status == WorkLocation::IN_MEMORY) { if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { Addr addr = location; int index_offset = offset; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 64c5c4af46..05e268270a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -43,7 +43,7 @@ namespace gem5 { -enum BitStatus +enum WorkLocation { PENDING_READ, IN_CACHE, @@ -65,6 +65,32 @@ enum CacheState class MPU; + +// TODO: Add active bit to WorkListItem class. Check active bit before activate +// Only activate if necessary and not active before. 
+class WorkDirectory +{ + private: + CoalesceEngine* owner; + Addr memoryAtomSize; + int atomBlockSize; + size_t elementSize; + + int _workCount; + public: + AddrRange memoryRange; + WorkDirectory(Addr atom_size, int block_size, size_t element_size): + memoryAtomSize(atom_size), atomBlockSize(block_size), + elementSize(element_size), _workCount(0) + {} + + void activate(Addr addr); + void deactivate(Addr addr); + int workCount(); + std::tuple getNextWork(); + +}; + class CoalesceEngine : public BaseMemoryEngine { private: @@ -140,7 +166,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); + std::tuple getOptimalPullAddr(); int maxPotentialPostPushWB; // A map from addr to sendMask. sendMask determines which bytes to From f217715d8eae9774027635e6652755cdeaab0c00 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 1 Nov 2022 00:15:16 -0700 Subject: [PATCH 210/247] Restructuring the cache. 
--- src/accl/graph/base/data_structs.hh | 17 +- src/accl/graph/sega/CoalesceEngine.py | 2 - src/accl/graph/sega/CoalesceEngine_bak.py | 50 + src/accl/graph/sega/coalesce_engine.cc | 553 +++------ src/accl/graph/sega/coalesce_engine.hh | 107 +- src/accl/graph/sega/coalesce_engine_bak.cc | 1308 ++++++++++++++++++++ src/accl/graph/sega/coalesce_engine_bak.hh | 218 ++++ 7 files changed, 1834 insertions(+), 421 deletions(-) create mode 100644 src/accl/graph/sega/CoalesceEngine_bak.py create mode 100644 src/accl/graph/sega/coalesce_engine_bak.cc create mode 100644 src/accl/graph/sega/coalesce_engine_bak.hh diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index d9028e2f10..070e635736 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -45,29 +45,33 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; - uint32_t degree : 32; uint32_t edgeIndex : 32; + uint32_t degree : 31; + bool active: 1; std::string to_string() { return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - tempProp, prop, degree, edgeIndex); + "WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u, active: %s}", tempProp, prop, edgeIndex, degree, + active ? 
"true" : "false"); } WorkListItem(): tempProp(0), prop(0), + edgeIndex(0), degree(0), - edgeIndex(0) + active(false) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t degree, uint32_t edge_index): + uint32_t edge_index, uint32_t degree, bool active): tempProp(temp_prop), prop(prop), + edgeIndex(edge_index), degree(degree), - edgeIndex(edge_index) + active(active) {} }; @@ -88,7 +92,6 @@ struct __attribute__ ((packed)) Edge weight(weight), neighbor(neighbor) {} - }; static_assert(isPowerOf2(sizeof(WorkListItem))); diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 1fd3b968c5..8ec9214b49 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -38,8 +38,6 @@ class CoalesceEngine(BaseMemoryEngine): num_mshr_entry = Param.Int("Number of MSHR entries.") - num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") diff --git a/src/accl/graph/sega/CoalesceEngine_bak.py b/src/accl/graph/sega/CoalesceEngine_bak.py new file mode 100644 index 0000000000..1fd3b968c5 --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine_bak.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + num_mshr_entry = Param.Int("Number of MSHR entries.") + + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. Used to limit b/w.") + + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7a064c1c2f..66ff66c068 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,10 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), - numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), - maxPotentialPostPushWB(0), + maxRespPerCycle(params.max_resp_per_cycle), cacheWorkCount(0), + numPullsReceived(0), activeBufferSize(params.post_push_wb_queue_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + pendingPullReads(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -59,6 +59,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): nextPreWBApplyEvent([this] { processNextPreWBApplyEvent(); }, name() + ".nextPreWBApplyEvent"), + nextPrePushApplyEvent([this] { + 
processNextPrePushApplyEvent(); + }, name() + ".nextPrePushApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -66,7 +69,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - needsPush.reset(); } void @@ -83,15 +85,10 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); + // TODO: Check postPushWBQueue for hits if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsApply); - // NOTE: No need to check needsWB because there might be entries - // that have been updated and not written back in the cache. - // assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(cacheBlocks[block_index].state == CacheState::IDLE); pkt->makeResponse(); pkt->setDataFromBlock( @@ -100,8 +97,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - // TODO: Add and implement init function for GraphWorkload. int bit_index_base = getBitIndexBase(pkt->getAddr()); + // FIXME: Pass workdirectory to graphworkload.init graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); memPort.sendFunctional(pkt); } @@ -110,6 +107,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) bool CoalesceEngine::done() { + // FIXME: Fix this later return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -123,6 +121,8 @@ CoalesceEngine::getBlockIndex(Addr addr) return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } +// FIXME: This and the next function should be moved to the +// WorkDirectory. 
// addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBitIndexBase(Addr addr) @@ -134,6 +134,7 @@ CoalesceEngine::getBitIndexBase(Addr addr) return atom_index * block_bits; } +// FIXME: Read FIXME: Above // index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) @@ -161,17 +162,10 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { + // Hit DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; - assert(!cacheBlocks[block_index].pendingData); - // No cache block could be in pendingApply and pendingWB at the - // same time. - assert(!(cacheBlocks[block_index].pendingApply && - cacheBlocks[block_index].pendingWB)); - // Hit - // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextResponseEvent for latency cycles in - // the future. + assert(cacheBlocks[block_index].state != CacheState::INVALID); responseQueue.push_back(std::make_tuple( addr, cacheBlocks[block_index].items[wl_offset], curTick())); @@ -189,12 +183,7 @@ CoalesceEngine::recvWLRead(Addr addr) responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // If they are scheduled for apply and WB those schedules should be - // discarded. Since there is no easy way to take items out of the - // function queue. Those functions check for their respective bits - // and skip the process if the respective bit is set to false. 
- cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].state = CacheState::BUSY; // HACK: If a read happens on the same cycle as another operation such // as apply set lastChangedTick to half a cycle later so that operation // scheduled by the original operation (apply in this example) are @@ -210,34 +199,20 @@ CoalesceEngine::recvWLRead(Addr addr) stats.numVertexReads++; return true; } else if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].pendingData)) { + (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { // Hit under miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", __func__, addr); stats.readHitUnderMisses++; assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(!cacheBlocks[block_index].dirty); + assert(!cacheBlocks[block_index].needsPreWBApply); assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " - "cacheBlocks[%d].\n", __func__, block_index); - } MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -245,195 +220,52 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - // FIXME: Make this assert work. It will break if the cache block - // is cold and addr or aligned_addr is 0. It fails because cache block - // addr field is initialized to 0. Unfortunately Addr type is unsigned. - // So you can not initialized addr to -1. assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" - " %lu not found in MSHRs.\n", __func__, block_index, addr); - if (MSHR.size() == numMSHREntries) { - // Out of MSHR entries - DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" - "Rejecting request.\n", __func__); - // TODO: Break out read rejections into more than one stat - // based on the cause of the rejection - stats.mshrEntryShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR " - "entries available.\n", __func__); - if ((cacheBlocks[block_index].valid) || - (cacheBlocks[block_index].pendingData)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state.\n", __func__, block_index); - // We're in idle state - // Idle: valid && !pendingApply && !pendingWB; - // Note 0: needsApply has to be false. Because - // A cache line enters the idle state from two - // other states. First a busy state that does not - // need apply (needsApply is already false) or - // from pendingApplyState after being applied which - // clears the needsApply bit. needsApply is useful - // when a cache block has transitioned from - // pendingApply to busy without the apply happening. - // Note 1: pendingData does not have to be evaluated - // becuase pendingData is cleared when data - // arrives from the memory and valid does not - // denote cleanliness of the line. Rather it - // is used to differentiate between empty blocks - // and the blocks that have data from memory. - // pendingData denotes the transient state between - // getting a miss and getting the data for that miss. - // valid basically means that the data in the cache - // could be used to respond to read/write requests. - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - // There are no conflicts in idle state. 
- assert(MSHR.find(block_index) == MSHR.end()); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" - "to be written back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextWriteBack for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " - "not need to be written back.\n", - __func__, block_index); - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextRead for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } - } - // cacheBlocks[block_index].hasConflict = true; - 
MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - stats.readMisses++; - // TODO: Add readConflicts here. - stats.numVertexReads++; - return true; - } else { - // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " - "Allocating a cache line for it.\n" - , __func__, addr); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR[block_index].size() == 0); - - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" - " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); + + if (cacheBlocks[block_index].state != CacheState::INVALID) { + // conflict miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " + "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + cacheBlocks[block_index].hasConflict = true; + if (cacheBlocks[block_index].state == CacheState::IDLE) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = 
CacheState::PENDING_WB; memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); + processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " - "input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, - cacheBlocks[block_index].to_string()); - stats.readMisses++; - stats.numVertexReads++; - return true; + } else { + // NOTE: move the cache block to invalid state + // FIXME: Fix the issue below. + // May need to activate tracking for this + cacheBlocks[block_index].reset(); } } + // return int instead of bool to tell WLEngine to whether + // roll the first entry in the queue. + return false; } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs. It has a conflict " - "with addr: %lu.\n", __func__, block_index, addr, - cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - assert(MSHR[block_index].size() > 0); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + if (MSHR.size() < numMSHREntries) { + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].needsPreWBApply = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + return true; + } else { return false; } - DPRINTF(CoalesceEngine, "%s: There is room for another target " - "for cacheBlocks[%d].\n", __func__, block_index); - - // TODO: Might want to differentiate between different misses. - stats.readMisses++; - - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " - "cacheBlocks[%d].\n", __func__, addr, block_index); - stats.numVertexReads++; - return true; } } } @@ -589,8 +421,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } - - // TODO: Probably check for done here too. delete pkt; return true; } @@ -771,15 +601,53 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextPreWBApplyEvent() { - int block_index = applyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + int block_index = preWBApplyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the preWBApplyQueue. 
" "cacheBlock[%d] to be applied.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingWB); + + if (cacheBlocks[block_index].state == CacheState::PENDING_PRE_WB_APPLY) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsPreWBApply); + bool block_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + bool active = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + block_active |= active; + if (active) { + // cacheWorkCount++; + // FUTUREME: When pulling from activeCacheBlocks, in case we + // face a block that is not in idle state, we basically pop + // that entry and push it to the back. We only delete entries + // in this buffer if pushed or evicted. + activeCacheBlocks.push_back(block_index); + } + } + if (block_active && !owner->running()) { + owner->start(); + } + + cacheBlocks[block_index].needsPreWBApply = false; + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + } else { + // FIXME: Solve below issue. + // Not dirty but could be active still. 
+ // need to activate tracking + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + } + cacheBlocks[block_index].lastChangedTick = curTick(); + } else { + + } if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); @@ -883,77 +751,85 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) __func__, block_index, cacheBlocks[block_index].to_string()); // A cache block should not be touched while it's waiting for data. // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - + // TODO: Figure out if this is still necessary. if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { return; } - assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(!cacheBlocks[block_index].needsPreWBApply); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) { PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + if (cacheBlocks[block_index].addr = wb_pkt->getAddr()) { wb_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - cacheBlocks[block_index].needsWB = true; - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == cacheBlocks[block_index].addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); 
- DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } + cacheBlocks[block_index].dirty = true; + need_send_pkt = false; postPushWBQueue.erase(wb); + } + } + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr = ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); need_send_pkt = false; + activeBuffer.erase(ab); } } + if (!need_send_pkt) { + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsPreWBApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + for (auto it = 
MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { + pendingVertexPullReads.end()) { need_send_pkt = false; } @@ -964,11 +840,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != - pendingVertexPullReads.end()) { - 
stats.numDoubleMemReads++; - } } } @@ -979,19 +850,27 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(cacheBlocks[block_index].pendingWB); - - // Why would we write it back if it does not have a conflict. - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(!cacheBlocks[block_index].needsPreWBApply); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + Addr base_addr = cacheBlocks[block_index].addr; + for (int index = 0; index < numElementsPerLine; index++) { + if (cacheBlocks[block_index].items[index].active) { + Addr vertex_addr = base_addr + index * sizeof(WorkListItem); + // NOTE: Implement this + // workdir.activate() + // cacheWorkCount--; + } + } + if (activeCacheBlocks.find(block_index)) { + activeCacheBlocks.erase(block_index); + } PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -999,30 +878,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); - // onTheFlyReqs++; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].pendingWB = false; - - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - 
DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); + cacheBlocks[block_index].reset(); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, @@ -1049,55 +905,6 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) } } -std::tuple -CoalesceEngine::getOptimalPullAddr() -{ - int visited_bits = 0; - int num_intial_active_bits = activeBits.size(); - while (visited_bits < num_intial_active_bits) { - int index = activeBits.front(); - int base_index = roundDown(index, numElementsPerLine); - int index_offset = index - base_index; - assert(needsPush[index] == 1); - assert(index_offset < numElementsPerLine); - - Addr addr = getBlockAddrFromBitIndex(base_index); - int block_index = getBlockIndex(addr); - if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) - { - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::PENDING_READ, addr, index_offset); - } else { - // Only if it is in cache and it is in idle state. 
- if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_CACHE, block_index, index_offset); - // Otherwise if it is in memory - } else if ((cacheBlocks[block_index].addr != addr)) { - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_MEMORY, addr, index_offset); - } - } - activeBits.pop_front(); - activeBits.push_back(index); - visited_bits++; - } - - return std::make_tuple(WorkLocation::GARBAGE, 0, 0); -} - void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { @@ -1262,8 +1069,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(vertexPushBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), "Histogram of the length of the bitvector."), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 05e268270a..8da67c7b43 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -60,9 +60,26 @@ enum CacheState IDLE, PENDING_PRE_WB_APPLY, PENDING_WB, + PENDING_PRE_PUSH_APPLY, NUM_CACHE_STATE }; +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_PRE_WB_APPLY", + "PENDING_WB", + "PENDING_PRE_PUSH_APPLY" +}; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH +}; + class MPU; @@ -71,7 +88,6 @@ class MPU; class WorkDirectory { private: - CoalesceEngine* owner; Addr 
memoryAtomSize; int atomBlockSize; size_t elementSize; @@ -88,7 +104,6 @@ class WorkDirectory void deactivate(Addr addr); int workCount(); std::tuple getNextWork(); - }; class CoalesceEngine : public BaseMemoryEngine @@ -100,47 +115,54 @@ class CoalesceEngine : public BaseMemoryEngine Addr addr; uint64_t busyMask; bool valid; - bool needsApply; - bool needsWB; - bool pendingData; - bool pendingApply; - bool pendingWB; - Tick lastChangedTick; + bool dirty; + bool hasConflict; + bool needsPreWBApply; CacheState state; - // TODO: This might be useful in the future - // Tick lastWLWriteTick; + Tick lastChangedTick; Block() {} Block(int num_elements): addr(-1), busyMask(0), valid(false), - needsApply(false), - needsWB(false), - pendingData(false), - pendingApply(false), - pendingWB(false), - lastChangedTick(0), - state(CacheState::INVALID) + dirty(false), + hasConflict(false), + needsPreWBApply(false), + state(CacheState::INVALID), + lastChangedTick(0) { items = new WorkListItem [num_elements]; } + void reset() { + addr = -1; + busyMask = 0; + valid = false; + dirty = false; + hasConflict = false; + needsPreWBApply = false; + state = CacheState::INVALID; + lastChangedTick = 0; + } + std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", - addr, busyMask, valid ? "true" : "false", - needsApply ? "true" : "false", needsWB ? "true" : "false", - pendingData ? "true" : "false", pendingApply ? "true" : "false", - pendingWB ? "true" : "false", lastChangedTick); + "dirty: %s, hasConflict: %s, needsPreWBApply: %s" + "state: %s, lastChangedTick: %lu}", addr, busyMask, + valid ? "true" : "false", dirty ? "true" : "false", + hasConflict ? "true" : "false", + needsPreWBApply ? 
"true" : "false", + cacheStateStrings[state], lastChangedTick); } }; - struct SenderState : public Packet::SenderState + struct ReadPurpose : public Packet::SenderState { - bool isRetry; - SenderState(bool is_retry): isRetry(is_retry) {} + ReadDestination _dest; + ReadPurpose(ReadDestination dest): _dest(dest) {} + ReadDestination dest() { return _dest; } }; + MPU* owner; GraphWorkload* graphWorkload; @@ -150,28 +172,33 @@ class CoalesceEngine : public BaseMemoryEngine int onTheFlyReqs; int numMSHREntries; - int numTgtsPerMSHR; std::unordered_map> MSHR; + + // Response route to WLEngine int maxRespPerCycle; std::deque> responseQueue; - int _workCount; + // Tracking work in cache + int cacheWorkCount; int numPullsReceived; - UniqueFIFO applyQueue; - std::bitset needsPush; - std::deque activeBits; + UniqueFIFO preWBApplyQueue; + // NOTE: Remember to erase from this upon eviction from cache + UniqueFIFO activeCacheBlocks; + + int pendingPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; + + int activeBufferSize; int postPushWBQueueSize; + std::deque> activeBuffer; std::deque> postPushWBQueue; int getBlockIndex(Addr addr); + // TODO: Should be moved to WorkDirectory int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); - - int maxPotentialPostPushWB; - // A map from addr to sendMask. sendMask determines which bytes to - // send for push when getting the read response from memory. 
- std::unordered_map pendingVertexPullReads; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -188,6 +215,9 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextPreWBApplyEvent; void processNextPreWBApplyEvent(); + EventFunctionWrapper nextPrePushApplyEvent; + void processNextPrePushApplyEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); @@ -223,7 +253,6 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; @@ -246,6 +275,8 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); + // FIXME: Update this to return sum of cacheWorkCount and WorkDirectory + // workcount. int workCount() { return _workCount; } void recvVertexPull(); diff --git a/src/accl/graph/sega/coalesce_engine_bak.cc b/src/accl/graph/sega/coalesce_engine_bak.cc new file mode 100644 index 0000000000..7a064c1c2f --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_bak.cc @@ -0,0 +1,1308 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), + numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), + maxPotentialPostPushWB(0), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextPreWBApplyEvent([this] { + processNextPreWBApplyEvent(); + }, name() + ".nextPreWBApplyEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + needsPush.reset(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + // NOTE: No need to check needsWB because there might be entries + // that have been updated and not written back in the cache. 
+ // assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + // TODO: Add and implement init function for GraphWorkload. + int bit_index_base = getBitIndexBase(pkt->getAddr()); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); + memPort.sendFunctional(pkt); + } +} + +bool +CoalesceEngine::done() +{ + return applyQueue.empty() && needsPush.none() && + memoryFunctionQueue.empty() && (onTheFlyReqs == 0); +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBitIndexBase(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + return atom_index * block_bits; +} + +// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) +Addr +CoalesceEngine::getBlockAddrFromBitIndex(int index) +{ + assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); + Addr trimmed_addr = index * sizeof(WorkListItem); + return peerMemoryRange.addIntlvBits(trimmed_addr); +} + +bool +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < 
numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].valid)) { + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(!cacheBlocks[block_index].pendingData); + // No cache block could be in pendingApply and pendingWB at the + // same time. + assert(!(cacheBlocks[block_index].pendingApply && + cacheBlocks[block_index].pendingWB)); + // Hit + // TODO: Add a hit latency as a param for this object. + // Can't just schedule the nextResponseEvent for latency cycles in + // the future. + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // If they are scheduled for apply and WB those schedules should be + // discarded. Since there is no easy way to take items out of the + // function queue. Those functions check for their respective bits + // and skip the process if the respective bit is set to false. 
+ cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return true; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].pendingData)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); + stats.mshrTargetShortage++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " + "cacheBlocks[%d].\n", __func__, block_index); + } + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return true; + } else { + // miss + // FIXME: Make this assert work. It will break if the cache block + // is cold and addr or aligned_addr is 0. It fails because cache block + // addr field is initialized to 0. Unfortunately Addr type is unsigned. + // So you can not initialized addr to -1. + assert(cacheBlocks[block_index].addr != aligned_addr); + assert(MSHR.size() <= numMSHREntries); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); + if (MSHR.find(block_index) == MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" + " %lu not found in MSHRs.\n", __func__, block_index, addr); + if (MSHR.size() == numMSHREntries) { + // Out of MSHR entries + DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" + "Rejecting request.\n", __func__); + // TODO: Break out read rejections into more than one stat + // based on the cause of the rejection + stats.mshrEntryShortage++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR " + "entries available.\n", __func__); + if ((cacheBlocks[block_index].valid) || + (cacheBlocks[block_index].pendingData)) { + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state.\n", __func__, block_index); + // We're in idle state + // Idle: valid && !pendingApply && !pendingWB; + // Note 0: needsApply has to be false. Because + // A cache line enters the idle state from two + // other states. First a busy state that does not + // need apply (needsApply is already false) or + // from pendingApplyState after being applied which + // clears the needsApply bit. needsApply is useful + // when a cache block has transitioned from + // pendingApply to busy without the apply happening. + // Note 1: pendingData does not have to be evaluated + // becuase pendingData is cleared when data + // arrives from the memory and valid does not + // denote cleanliness of the line. Rather it + // is used to differentiate between empty blocks + // and the blocks that have data from memory. + // pendingData denotes the transient state between + // getting a miss and getting the data for that miss. + // valid basically means that the data in the cache + // could be used to respond to read/write requests. + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + // There are no conflicts in idle state. 
+ assert(MSHR.find(block_index) == MSHR.end()); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" + "to be written back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextWriteBack for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " + "not need to be written back.\n", + __func__, block_index); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextRead for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } + } + // cacheBlocks[block_index].hasConflict = true; + 
MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + stats.readMisses++; + // TODO: Add readConflicts here. + stats.numVertexReads++; + return true; + } else { + // MSHR available and no conflict + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " + "Allocating a cache line for it.\n" + , __func__, addr); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR[block_index].size() == 0); + + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + " Addr: %lu.\n", __func__, block_index, addr); + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " + "input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + 
} + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + stats.readMisses++; + stats.numVertexReads++; + return true; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " + "Addr: %lu already in MSHRs. It has a conflict " + "with addr: %lu.\n", __func__, block_index, addr, + cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() > 0); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. Rejecting request.\n", + __func__, block_index); + stats.mshrTargetShortage++; + return false; + } + DPRINTF(CoalesceEngine, "%s: There is room for another target " + "for cacheBlocks[%d].\n", __func__, block_index); + + // TODO: Might want to differentiate between different misses. + stats.readMisses++; + + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " + "cacheBlocks[%d].\n", __func__, addr, block_index); + stats.numVertexReads++; + return true; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + return true; + } + + onTheFlyReqs--; + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + WorkListItem* items = pkt->getPtr(); + + bool do_wb = false; + if (pkt->findNextSenderState()) { + assert(!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid))); + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. 
+ + DPRINTF(CoalesceEngine, "%s: Received read response for pull read " + "for addr %lu.\n", __func__, addr); + int it = getBitIndexBase(addr); + uint64_t send_mask = pendingVertexPullReads[addr]; + // No applying of the line needed. + for (int i = 0; i < numElementsPerLine; i++) { + Addr vertex_addr = addr + i * sizeof(WorkListItem); + uint64_t vertex_send_mask = send_mask & (1 << i); + if (vertex_send_mask != 0) { + assert(needsPush[it + i] == 1); + needsPush[it + i] = 0; + _workCount--; + + uint32_t delta; + bool do_push, do_wb_v; + std::tie(delta, do_push, do_wb_v) = + graphWorkload->prePushApply(items[i]); + do_wb |= do_wb_v; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + items[i].edgeIndex, items[i].degree); + } else { + // TODO: Add a stat to count this. + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pendingVertexPullReads.erase(addr); + maxPotentialPostPushWB--; + } + + bool cache_wb = false; + if (cacheBlocks[block_index].addr == addr) { + DPRINTF(CoalesceEngine, "%s: Received read response to " + "fill cacheBlocks[%d].\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR.find(block_index) != MSHR.end()); + std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); + for (int i = 0; i < numElementsPerLine; i++) { + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[i])); + } + 
cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsWB |= do_wb; + cacheBlocks[block_index].pendingData = false; + // HACK: In case processNextRead is called on the same tick as curTick + // and is scheduled to read to the same cacheBlocks[block_index] + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + cache_wb = true; + } else if (do_wb) { + PacketPtr wb_pkt = createWritePacket( + addr, peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + // TODO: Add a stat to count this. + // FIXME: This is not a totally wasteful read. e.g. all reads + // for pull in BFS are like this. + DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); + } + + if (cache_wb) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + } + + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + + + // TODO: Probably check for done here too. + delete pkt; + return true; +} + +// TODO: For loop to empty the entire responseQueue. +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + // Desing does not allow for write misses for now. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask != 0); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + // respective bit in busyMask for wl is set. 
+ assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].needsWB |= true; + stats.numVertexWrites++; + } + cacheBlocks[block_index].items[wl_offset] = wl; + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { + cacheBlocks[block_index].needsApply |= true; + cacheBlocks[block_index].needsWB |= true; + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + // TODO: Make this more general and programmable. + if ((cacheBlocks[block_index].busyMask == 0)) { + if (cacheBlocks[block_index].needsApply) { + cacheBlocks[block_index].pendingApply = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " + "applyQueue.\n", __func__, block_index); + if ((!applyQueue.empty()) && + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); + } + } else { + assert(MSHR.size() <= numMSHREntries); + // cache line has conflict. 
+ if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflict.\n", __func__, block_index); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" + " back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" + " a write back.\n", __func__, block_index); + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + 
(!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + +} + +void +CoalesceEngine::processNextPreWBApplyEvent() +{ + int block_index = applyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + "cacheBlock[%d] to be applied.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingWB); + + if (cacheBlocks[block_index].pendingApply) { + assert(cacheBlocks[block_index].busyMask == 0); + for (int index = 0; index < numElementsPerLine; index++) { + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + if (do_push) { + int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); + if (needsPush[bit_index_base + index] == 0) { + needsPush[bit_index_base + index] = 1; + _workCount++; + activeBits.push_back(bit_index_base + index); + if (!owner->running()) { + owner->start(); + } + } + } + } + stats.bitvectorLength.sample(needsPush.count()); + + assert(cacheBlocks[block_index].needsWB); + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + + assert(MSHR.size() <= numMSHREntries); + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflicts.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + 
[this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + stats.numInvalidApplies++; + } + + applyQueue.pop_front(); + if ((!applyQueue.empty()) && + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); + } + + if (done()) { + owner->recvDoneSignal(); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + bool need_send_pkt = true; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].needsWB = true; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == cacheBlocks[block_index].addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + 
responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + postPushWBQueue.erase(wb); + need_send_pkt = false; + } + } + + if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != + pendingVertexPullReads.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(cacheBlocks[block_index].pendingWB); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + // onTheFlyReqs++; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].pendingWB = false; + + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + stats.numInvalidWriteBacks++; + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + memPort.sendPacket(wb_pkt); + postPushWBQueue.pop_front(); + } +} + +std::tuple +CoalesceEngine::getOptimalPullAddr() +{ + int visited_bits = 0; + int num_intial_active_bits = activeBits.size(); + while (visited_bits < num_intial_active_bits) { + int index = activeBits.front(); + int base_index = roundDown(index, numElementsPerLine); + int index_offset = index - base_index; + assert(needsPush[index] == 1); + assert(index_offset < numElementsPerLine); + + Addr addr = getBlockAddrFromBitIndex(base_index); + int block_index = getBlockIndex(addr); + if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) + { + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::PENDING_READ, addr, index_offset); + } else { + // Only if it is in cache and it is in idle state. 
+ if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::IN_CACHE, block_index, index_offset); + // Otherwise if it is in memory + } else if ((cacheBlocks[block_index].addr != addr)) { + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::IN_MEMORY, addr, index_offset); + } + } + activeBits.pop_front(); + activeBits.push_back(index); + visited_bits++; + } + + return std::make_tuple(WorkLocation::GARBAGE, 0, 0); +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + WorkLocation bit_status; + Addr location; + int offset; + + std::tie(bit_status, location, offset) = getOptimalPullAddr(); + + if (bit_status != WorkLocation::GARBAGE) { + if (bit_status == WorkLocation::PENDING_READ) { + // renaming the outputs to thier local names. + Addr addr = location; + int index_offset = offset; + + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } + if (bit_status == WorkLocation::IN_CACHE) { + // renaming the outputs to their local names. 
+ int block_index = (int) location; + int wl_offset = offset; + + Addr addr = cacheBlocks[block_index].addr; + Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); + int slice_base_index = getBitIndexBase(addr); + + needsPush[slice_base_index + wl_offset] = 0; + _workCount--; + + uint32_t delta; + bool do_push, do_wb; + std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].needsWB |= do_wb; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + cacheBlocks[block_index].items[wl_offset].edgeIndex, + cacheBlocks[block_index].items[wl_offset].degree); + } else { + DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + numPullsReceived--; + } + if (bit_status == WorkLocation::IN_MEMORY) { + if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + onTheFlyReqs++; + maxPotentialPostPushWB++; + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } + } + } + + stats.bitvectorSearchStatus[bit_status]++; + + if (numPullsReceived > 0) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " + "0 to memoryFunctionQueue.\n", __func__); + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + 
DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +void +CoalesceEngine::recvVertexPull() +{ + bool should_schedule = (numPullsReceived == 0); + numPullsReceived++; + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (should_schedule) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), + "Number of cache rejections caused by entry shortage."), + ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), + "Number of cache rejections caused by target shortage."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. "), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), + "Number of times a memory block has been read twice. 
" + "Once for push and once to populate the cache."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(numInvalidApplies, statistics::units::Count::get(), + "Number of times a line has become busy" + " while waiting to be applied."), + ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), + "Number of times a scheduled memory function has been invalid."), + ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), + "Distribution for the location of vertex searches."), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(mshrEntryLength, statistics::units::Count::get(), + "Histogram on the length of the mshr entries."), + ADD_STAT(bitvectorLength, statistics::units::Count::get(), + "Histogram of the length of the bitvector."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + bitvectorSearchStatus.init(NUM_STATUS); + bitvectorSearchStatus.subname(0, "PENDING_READ"); + bitvectorSearchStatus.subname(1, "IN_CACHE"); + bitvectorSearchStatus.subname(2, "IN_MEMORY"); + bitvectorSearchStatus.subname(3, "GARBAGE"); + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine_bak.hh b/src/accl/graph/sega/coalesce_engine_bak.hh new file mode 100644 index 0000000000..0787a334c1 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_bak.hh @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "base/cprintf.hh" +#include "base/statistics.hh" +#include "params/CoalesceEngine.hh" + + + +namespace gem5 +{ + +enum WorkLocation +{ + PENDING_READ, + IN_CACHE, + IN_MEMORY, + GARBAGE, + NUM_STATUS +}; + +class MPU; + +class CoalesceEngine : public BaseMemoryEngine +{ + private: + struct Block + { + WorkListItem* items; + Addr addr; + uint64_t busyMask; + bool valid; + bool needsApply; + bool needsWB; + bool pendingData; + bool pendingApply; + bool pendingWB; + Tick lastChangedTick; + // TODO: This might be useful in the future + // Tick lastWLWriteTick; + Block() {} + Block(int num_elements): + addr(-1), + busyMask(0), + valid(false), + needsApply(false), + needsWB(false), + pendingData(false), + pendingApply(false), + pendingWB(false), + lastChangedTick(0), + { + items = new WorkListItem [num_elements]; + } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "needsApply: %s, needsWB: %s, pendingData: %s, " + "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + needsApply ? "true" : "false", needsWB ? "true" : "false", + pendingData ? "true" : "false", pendingApply ? "true" : "false", + pendingWB ? 
"true" : "false", lastChangedTick); + } + }; + + struct SenderState : public Packet::SenderState + { + bool isRetry; + SenderState(bool is_retry): isRetry(is_retry) {} + }; + MPU* owner; + GraphWorkload* graphWorkload; + + int numLines; + int numElementsPerLine; + Block* cacheBlocks; + + int onTheFlyReqs; + int numMSHREntries; + int numTgtsPerMSHR; + std::unordered_map> MSHR; + int maxRespPerCycle; + std::deque> responseQueue; + + int _workCount; + int numPullsReceived; + UniqueFIFO applyQueue; + std::bitset needsPush; + std::deque activeBits; + int postPushWBQueueSize; + std::deque> postPushWBQueue; + + int getBlockIndex(Addr addr); + int getBitIndexBase(Addr addr); + Addr getBlockAddrFromBitIndex(int index); + std::tuple getOptimalPullAddr(); + + int maxPotentialPostPushWB; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; + + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; + + EventFunctionWrapper nextResponseEvent; + void processNextResponseEvent(); + + EventFunctionWrapper nextPreWBApplyEvent; + void processNextPreWBApplyEvent(); + + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + virtual void regStats() override; + + virtual void resetStats() override; + + CoalesceEngine &coalesce; + + Tick lastResetTick; + + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; + 
statistics::Scalar responsePortShortage; + statistics::Scalar numMemoryBlocks; + statistics::Scalar numDoubleMemReads; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + statistics::Scalar numInvalidApplies; + statistics::Scalar numInvalidWriteBacks; + + statistics::Vector bitvectorSearchStatus; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; + statistics::Histogram responseQueueLatency; + statistics::Histogram memoryFunctionLatency; + }; + + CoalesceStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + PARAMS(CoalesceEngine); + CoalesceEngine(const Params ¶ms); + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt); + + bool recvWLRead(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount() { return _workCount; } + void recvVertexPull(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ From 80b3803f040e09cae9f083e39d637c6445aab247 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 Nov 2022 00:05:27 -0800 Subject: [PATCH 211/247] First working and tested version of workdirectory. 
--- configs/accl/bfs.py | 1 + configs/accl/sega.py | 6 +- src/accl/graph/base/data_structs.hh | 23 +- src/accl/graph/base/graph_workload.cc | 236 ++-- src/accl/graph/base/graph_workload.hh | 67 +- src/accl/graph/sega/CenteralController.py | 2 +- src/accl/graph/sega/CoalesceEngine.py | 7 +- src/accl/graph/sega/CoalesceEngine_bak.py | 50 - src/accl/graph/sega/SConscript | 5 +- src/accl/graph/sega/centeral_controller.cc | 14 +- src/accl/graph/sega/centeral_controller.hh | 3 +- src/accl/graph/sega/coalesce_engine.cc | 932 +++++++------- src/accl/graph/sega/coalesce_engine.hh | 117 +- src/accl/graph/sega/coalesce_engine_bak.cc | 1308 -------------------- src/accl/graph/sega/coalesce_engine_bak.hh | 218 ---- src/accl/graph/sega/enums.cc | 57 + src/accl/graph/sega/enums.hh | 66 + src/accl/graph/sega/mpu.cc | 6 - src/accl/graph/sega/mpu.hh | 6 +- src/accl/graph/sega/push_engine.cc | 37 +- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 28 +- src/accl/graph/sega/wl_engine.hh | 1 + src/accl/graph/sega/work_directory.hh | 212 ++++ src/mem/mem_ctrl.cc | 2 +- 25 files changed, 1030 insertions(+), 2376 deletions(-) delete mode 100644 src/accl/graph/sega/CoalesceEngine_bak.py delete mode 100644 src/accl/graph/sega/coalesce_engine_bak.cc delete mode 100644 src/accl/graph/sega/coalesce_engine_bak.hh create mode 100644 src/accl/graph/sega/enums.cc create mode 100644 src/accl/graph/sega/enums.hh create mode 100644 src/accl/graph/sega/work_directory.hh diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index fc32b96642..a201acd4d1 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -68,6 +68,7 @@ def get_inputs(): m5.instantiate() + system.create_pop_count_directory(256) system.create_bfs_workload(init_addr, init_value) exit_event = m5.simulate() print( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0f4b133791..54f22b1377 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -54,8 +54,8 @@ def __init__(self, 
edge_memory_size: str, cache_size: str): attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=64, - num_tgts_per_mshr=64, max_resp_per_cycle=8, + active_buffer_size = 64, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -139,6 +139,10 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def create_pop_count_directory(self, atoms_per_block): + for gpt in self.gpts: + gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 070e635736..84233ae39c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -36,8 +36,6 @@ #include #include -#define MAX_BITVECTOR_SIZE (1 << 28) - namespace gem5 { @@ -45,33 +43,28 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; + uint32_t degree : 32; uint32_t edgeIndex : 32; - uint32_t degree : 31; - bool active: 1; std::string to_string() { - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " - "degree: %u, active: %s}", tempProp, prop, edgeIndex, degree, - active ? 
"true" : "false"); + return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u}", tempProp, prop, edgeIndex, degree); } WorkListItem(): tempProp(0), prop(0), - edgeIndex(0), degree(0), - active(false) + edgeIndex(0) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t edge_index, uint32_t degree, bool active): + uint32_t degree, uint32_t edge_index): tempProp(temp_prop), prop(prop), - edgeIndex(edge_index), degree(degree), - active(active) + edgeIndex(edge_index) {} }; @@ -111,8 +104,8 @@ struct MetaEdge { std::string to_string() { - return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u}", - src, dst, weight); + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u, value: %u}", + src, dst, weight, value); } }; diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 07accff44f..446509201f 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -56,39 +56,27 @@ readFromFloat(float value) return float_bits; } -BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): - GraphWorkload(), initValue(init_value), atomSize(atom_size) -{ - initAddrBase = roundDown(init_addr, atomSize); - initIndex = (init_addr - initAddrBase) / atomSize; - numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -} - - void -BFSWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) +BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) { - if (pkt->getAddr() == initAddrBase) { - WorkListItem items[numElementsPerLine]; + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); - pkt->writeDataToBlock((uint8_t*) items, atomSize); + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; - items[initIndex].tempProp = initValue; - items[initIndex].prop = initValue; - if 
(items[initIndex].degree > 0) { - needsPush[bit_index_base + initIndex] = 1; - activeBits.push_back(bit_index_base + initIndex); - _workCount++; - } + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } pkt->deleteData(); pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, atomSize); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); } - } uint32_t @@ -104,28 +92,16 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) } bool -BFSWorkload::applyCondition(WorkListItem wl) +BFSWorkload::activeCondition(WorkListItem wl) { - return wl.tempProp < wl.prop; -} - -bool -BFSWorkload::preWBApply(WorkListItem& wl) -{ - if (applyCondition(wl)) { - wl.prop = wl.tempProp; - if (wl.degree > 0) { - return true; - } - } - return false; + return (wl.tempProp < wl.prop) && (wl.degree > 0); } -std::tuple -BFSWorkload::prePushApply(WorkListItem& wl) +uint32_t +BFSWorkload::apply(WorkListItem& wl) { - uint32_t value = wl.prop; - return std::make_tuple(value, true, false); + wl.prop = wl.tempProp; + return wl.prop; } std::string @@ -137,92 +113,92 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } -PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): - GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) -{ - numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -} - -void -PRWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) -{ - WorkListItem items[numElementsPerLine]; - - pkt->writeDataToBlock((uint8_t*) items, atomSize); - for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - alpha); - if (items[i].degree > 0) { - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); - 
_workCount++; - } - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, atomSize); -} - -uint32_t -PRWorkload::reduce(uint32_t update, uint32_t value) -{ - float update_float = writeToFloat(update); - float value_float = writeToFloat(value); - return readFromFloat(update_float + value_float); -} - -uint32_t -PRWorkload::propagate(uint32_t value, uint32_t weight) -{ - float value_float = writeToFloat(value); - float weight_float = 1.0; - - return readFromFloat(alpha * value_float * weight_float); -} - -bool -PRWorkload::applyCondition(WorkListItem wl) -{ - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float dist = std::abs(temp_float - prop_float); - return dist >= threshold; -} - -bool -PRWorkload::preWBApply(WorkListItem& wl) -{ - if (applyCondition(wl) && (wl.degree > 0)) { - return true; - } - return false; -} - -std::tuple -PRWorkload::prePushApply(WorkListItem& wl) -{ - if (applyCondition(wl)) { - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float delta = (temp_float - prop_float) / wl.degree; - uint32_t delta_uint = readFromFloat(delta); - wl.prop = wl.tempProp; - return std::make_tuple(delta_uint, true, true); - } - return std::make_tuple(0, false, false); -} - -std::string -PRWorkload::printWorkListItem(const WorkListItem wl) -{ - float temp_float = writeToFloat(wl.tempProp); - return csprintf( - "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", - temp_float, temp_float, wl.degree, wl.edgeIndex - ); -} +// PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): +// GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) +// { +// numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); +// } + +// void +// PRWorkload::init(PacketPtr pkt, int bit_index_base, +// std::bitset& needsPush, +// std::deque& activeBits, +// int& _workCount) +// { +// WorkListItem items[numElementsPerLine]; + 
+// pkt->writeDataToBlock((uint8_t*) items, atomSize); +// for (int i = 0; i < numElementsPerLine; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - alpha); +// if (items[i].degree > 0) { +// needsPush[bit_index_base + i] = 1; +// activeBits.push_back(bit_index_base + i); +// _workCount++; +// } +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, atomSize); +// } + +// uint32_t +// PRWorkload::reduce(uint32_t update, uint32_t value) +// { +// float update_float = writeToFloat(update); +// float value_float = writeToFloat(value); +// return readFromFloat(update_float + value_float); +// } + +// uint32_t +// PRWorkload::propagate(uint32_t value, uint32_t weight) +// { +// float value_float = writeToFloat(value); +// float weight_float = 1.0; + +// return readFromFloat(alpha * value_float * weight_float); +// } + +// bool +// PRWorkload::applyCondition(WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float dist = std::abs(temp_float - prop_float); +// return dist >= threshold; +// } + +// bool +// PRWorkload::preWBApply(WorkListItem& wl) +// { +// if (applyCondition(wl) && (wl.degree > 0)) { +// return true; +// } +// return false; +// } + +// std::tuple +// PRWorkload::apply(WorkListItem& wl) +// { +// if (applyCondition(wl)) { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float delta = (temp_float - prop_float) / wl.degree; +// uint32_t delta_uint = readFromFloat(delta); +// wl.prop = wl.tempProp; +// return std::make_tuple(delta_uint, true, true); +// } +// return std::make_tuple(0, false, false); +// } + +// std::string +// PRWorkload::printWorkListItem(const WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// return csprintf( +// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", +// temp_float, temp_float, wl.degree, 
wl.edgeIndex +// ); +// } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 6bbc4935c2..f71955bd16 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -34,6 +34,7 @@ #include #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/work_directory.hh" #include "mem/packet.hh" @@ -46,70 +47,54 @@ class GraphWorkload GraphWorkload() {} ~GraphWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) = 0; + virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; - virtual bool applyCondition(WorkListItem wl) = 0; - virtual bool preWBApply(WorkListItem& wl) = 0; - virtual std::tuple prePushApply(WorkListItem& wl) = 0; + virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool activeCondition(WorkListItem wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; class BFSWorkload : public GraphWorkload { private: - uint64_t initAddrBase; - int initIndex; + uint64_t initAddr; uint32_t initValue; - int numElementsPerLine; - int atomSize; public: - BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size); + BFSWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} ~BFSWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount); + virtual void init(PacketPtr pkt, WorkDirectory* dir); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual bool applyCondition(WorkListItem wl); - virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); + virtual uint32_t apply(WorkListItem& wl); 
+ virtual bool activeCondition(WorkListItem wl); virtual std::string printWorkListItem(const WorkListItem wl); }; -class PRWorkload : public GraphWorkload -{ - private: - float alpha; - float threshold; +// class PRWorkload : public GraphWorkload +// { +// private: +// float alpha; +// float threshold; - int numElementsPerLine; - int atomSize; - - public: - PRWorkload(float alpha, float threshold, int atom_size); +// public: +// PRWorkload(float alpha, float threshold); - ~PRWorkload() {} +// ~PRWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual bool applyCondition(WorkListItem wl); - virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 09a997696d..0c21833a05 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createBFSWorkload"), - PyBindMethod("createPRWorkload"), + # PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 8ec9214b49..a447dedc3d 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ 
b/src/accl/graph/sega/CoalesceEngine.py @@ -27,6 +27,7 @@ from m5.params import * from m5.proxy import * +from m5.util.pybind import PyBindMethod from m5.objects.BaseMemoryEngine import BaseMemoryEngine class CoalesceEngine(BaseMemoryEngine): @@ -40,9 +41,13 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") - + active_buffer_size = Param.Int("Maximum number of memory active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. Probably, they should be equal.") post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") + cxx_exports = [PyBindMethod("createPopCountDirectory")] diff --git a/src/accl/graph/sega/CoalesceEngine_bak.py b/src/accl/graph/sega/CoalesceEngine_bak.py deleted file mode 100644 index 1fd3b968c5..0000000000 --- a/src/accl/graph/sega/CoalesceEngine_bak.py +++ /dev/null @@ -1,50 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseMemoryEngine import BaseMemoryEngine - -class CoalesceEngine(BaseMemoryEngine): - type = 'CoalesceEngine' - cxx_header = "accl/graph/sega/coalesce_engine.hh" - cxx_class = 'gem5::CoalesceEngine' - - cache_size = Param.MemorySize("Size of the internal SRAM array.") - - num_mshr_entry = Param.Int("Number of MSHR entries.") - - num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") - - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " - "requestor in each cycle. 
Used to limit b/w.") - - post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " - "apply process for applications that require " - "the apply process to happen exactly before " - "pushing the edgePointer to the PushEngine.") - diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 5d411be9ac..b3e1a838fb 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -37,6 +37,7 @@ SimObject("WLEngine.py", sim_objects=["WLEngine"]) Source("base_memory_engine.cc") Source("centeral_controller.cc") Source("coalesce_engine.cc") +Source("enums.cc") Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") @@ -45,10 +46,10 @@ DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") DebugFlag("CacheBlockState") DebugFlag("CoalesceEngine") -DebugFlag("FinalAnswer") DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") DebugFlag("WLEngine") CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", - "WLEngine", "BaseMemoryEngine"]) \ No newline at end of file + "WLEngine", "BaseMemoryEngine"]) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fc2262e111..883992e64e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,6 +82,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { + mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount()> 0)) { mpu->start(); } @@ -106,14 +107,14 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { - workload = new BFSWorkload(init_addr, init_value, system->cacheLineSize()); + workload = new BFSWorkload(init_addr, init_value); } -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold, 
system->cacheLineSize()); -} +// void +// CenteralController::createPRWorkload(float alpha, float threshold) +// { +// workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); +// } void CenteralController::recvDoneSignal() @@ -144,6 +145,7 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { + workload->apply(items[i]); std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 9ddb1b35f0..6eb07dbcac 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -35,7 +35,6 @@ #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" -#include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -64,7 +63,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createPRWorkload(float alpha, float threshold); + // void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 66ff66c068..0aa61345f7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,6 +34,7 @@ #include "base/intmath.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" +#include "debug/MSDebug.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -42,26 +43,23 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), + BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / 
peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - maxRespPerCycle(params.max_resp_per_cycle), cacheWorkCount(0), - numPullsReceived(0), activeBufferSize(params.post_push_wb_queue_size), + maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), pendingPullReads(0), + activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), - pendingPullReads(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), nextResponseEvent([this] { processNextResponseEvent(); }, name() + ".nextResponseEvent"), - nextPreWBApplyEvent([this] { - processNextPreWBApplyEvent(); - }, name() + ".nextPreWBApplyEvent"), - nextPrePushApplyEvent([this] { - processNextPrePushApplyEvent(); - }, name() + ".nextPrePushApplyEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -69,6 +67,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + activeBuffer.clear(); + postPushWBQueue.clear(); } void @@ -85,7 +85,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); - // TODO: Check postPushWBQueue for hits + // FIXME: Check postPushWBQueue for hits if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -97,54 +97,70 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - int bit_index_base = getBitIndexBase(pkt->getAddr()); - // FIXME: Pass workdirectory to graphworkload.init - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); + graphWorkload->init(pkt, directory); + if (pkt->getAddr() > 
lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } memPort.sendFunctional(pkt); } } +void +CoalesceEngine::postMemInitSetup() +{ + directory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::createPopCountDirectory(int atoms_per_block) +{ + directory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + bool CoalesceEngine::done() { - // FIXME: Fix this later - return applyQueue.empty() && needsPush.none() && - memoryFunctionQueue.empty() && (onTheFlyReqs == 0); + return memoryFunctionQueue.empty() && activeCacheBlocks.empty() && + activeBuffer.empty() && directory->empty() && (onTheFlyReqs == 0); } -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBlockIndex(Addr addr) +bool +CoalesceEngine::timeToPull() { - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; + return (activeBuffer.size() + pendingPullReads) < activeBufferSize; } -// FIXME: This and the next function should be moved to the -// WorkDirectory. 
-// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBitIndexBase(Addr addr) +bool +CoalesceEngine::canSchedulePull() { - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - return atom_index * block_bits; + // TODO: Maybe a good idea to change this to + // activeBuffer.size() + pendingPullReads + pullsScheduled < activeBufferSize + return pullsScheduled < 1; } -// FIXME: Read FIXME: Above -// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) -Addr -CoalesceEngine::getBlockAddrFromBitIndex(int index) +bool +CoalesceEngine::workLeftInMem() { - assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr trimmed_addr = index * sizeof(WorkListItem); - return peerMemoryRange.addIntlvBits(trimmed_addr); + return !directory->empty(); } bool +CoalesceEngine::pullCondition() +{ + return ((activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize); +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +ReadReturnStatus CoalesceEngine::recvWLRead(Addr addr) { Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); @@ -163,6 +179,9 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + if (cacheBlocks[block_index].state == CacheState::LOCKED_FOR_APPLY) { + return ReadReturnStatus::REJECT_NO_ROLL; + } DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; assert(cacheBlocks[block_index].state != CacheState::INVALID); @@ -197,7 +216,7 @@ CoalesceEngine::recvWLRead(Addr addr) 
schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; - return true; + return ReadReturnStatus::ACCEPT; } else if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { // Hit under miss @@ -207,7 +226,6 @@ CoalesceEngine::recvWLRead(Addr addr) assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].dirty); - assert(!cacheBlocks[block_index].needsPreWBApply); assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); @@ -217,7 +235,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); stats.numVertexReads++; - return true; + return ReadReturnStatus::ACCEPT; } else { // miss assert(cacheBlocks[block_index].addr != aligned_addr); @@ -232,20 +250,37 @@ CoalesceEngine::recvWLRead(Addr addr) if (cacheBlocks[block_index].state == CacheState::IDLE) { if (cacheBlocks[block_index].dirty) { cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } else { - // NOTE: move the cache block to invalid state - // FIXME: Fix the issue below. - // May need to activate tracking for this + // NOTE: The cache block could still be active but + // not dirty. If active we only have to active tracking + // but can throw the data away. 
+ bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); + } + // NOTE: Bring the cache line to invalid state. + // NOTE: Above line where we set hasConflict to true + // does not matter anymore since we reset the cache line. cacheBlocks[block_index].reset(); } + return ReadReturnStatus::REJECT_NO_ROLL; + } else { + return ReadReturnStatus::REJECT_ROLL; } - // return int instead of bool to tell WLEngine to whether - // roll the first entry in the queue. - return false; } else { // cold miss assert(MSHR.find(block_index) == MSHR.end()); @@ -255,16 +290,21 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].valid = false; cacheBlocks[block_index].dirty = false; cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].needsPreWBApply = false; cacheBlocks[block_index].state = CacheState::PENDING_DATA; cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextRead(block_index, schedule_tick); }, block_index, curTick()); - return true; + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; } else { - return false; + return ReadReturnStatus::REJECT_ROLL; } } } @@ -276,116 +316,87 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(pkt->isResponse()); DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", __func__, pkt->print()); + + onTheFlyReqs--; if (pkt->isWrite()) { DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); delete pkt; - return true; - } - - onTheFlyReqs--; - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - 
WorkListItem* items = pkt->getPtr(); - - bool do_wb = false; - if (pkt->findNextSenderState()) { - assert(!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid))); - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. - - DPRINTF(CoalesceEngine, "%s: Received read response for pull read " - "for addr %lu.\n", __func__, addr); - int it = getBitIndexBase(addr); - uint64_t send_mask = pendingVertexPullReads[addr]; - // No applying of the line needed. - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - uint64_t vertex_send_mask = send_mask & (1 << i); - if (vertex_send_mask != 0) { - assert(needsPush[it + i] == 1); - needsPush[it + i] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb_v; - std::tie(delta, do_push, do_wb_v) = - graphWorkload->prePushApply(items[i]); - do_wb |= do_wb_v; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - items[i].edgeIndex, items[i].degree); - } else { - // TODO: Add a stat to count this. - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - } + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); } - pendingVertexPullReads.erase(addr); - maxPotentialPostPushWB--; - } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. 
+ // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but it was allocated + // for in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will be not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before pkt arriveed. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. 
+ if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } - bool cache_wb = false; - if (cacheBlocks[block_index].addr == addr) { - DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR.find(block_index) != MSHR.end()); - std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[i])); - } - cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsWB |= do_wb; - cacheBlocks[block_index].pendingData = false; - // HACK: In case processNextRead is called on the same tick as curTick - // and is scheduled to read to the same cacheBlocks[block_index] - cacheBlocks[block_index].lastChangedTick = - curTick() - (Tick) (clockPeriod() / 2); - cache_wb = true; - } else if (do_wb) { - PacketPtr wb_pkt = createWritePacket( - addr, peerMemoryAtomSize, (uint8_t*) items); - postPushWBQueue.emplace_back(wb_pkt, curTick()); - memoryFunctionQueue.emplace_back( - [this] (int ignore, Tick schedule_tick) { - processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); 
- } - } else { - // TODO: Add a stat to count this. - // FIXME: This is not a totally wasteful read. e.g. all reads - // for pull in BFS are like this. - DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); - } + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + directory->deactivate(addr); + activeCacheBlocks.push_back(block_index); + } - if (cache_wb) { - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); - if (aligned_miss_addr == addr) { + assert(aligned_miss_addr == cacheBlocks[block_index].addr); int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " "cacheBlocks[%d] can be serviced with the received " "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " @@ -400,32 +411,72 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) graphWorkload->printWorkListItem( cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); - // TODO: 
Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(items[index]); + } + if (atom_active) { + directory->deactivate(addr); + activeBuffer.emplace_back(pkt, curTick()); + DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. 
" + "activeBuffer.size: %d.\n", __func__, + pkt->print(), activeBuffer.size()); } else { - it++; + delete pkt; + } + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // pullsScheduled++; + // } + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; } } } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); + if (done()) { + owner->recvDoneSignal(); } - - delete pkt; return true; } -// TODO: For loop to empty the entire responseQueue. void CoalesceEngine::processNextResponseEvent() { @@ -450,8 +501,8 @@ CoalesceEngine::processNextResponseEvent() addr_response); responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " "responseQueue.size = %d.\n", __func__, @@ -491,27 +542,28 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " "with Addr: %lu.\n", __func__, graphWorkload->printWorkListItem(wl), addr); - // Desing does not allow for write misses for now. + + // NOTE: Design does not allow for write misses. 
assert(cacheBlocks[block_index].addr == aligned_addr); // cache state asserts - assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask != 0); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); // respective bit in busyMask for wl is set. assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { - cacheBlocks[block_index].needsWB |= true; - stats.numVertexWrites++; + cacheBlocks[block_index].dirty |= true; } cacheBlocks[block_index].items[wl_offset] = wl; - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { - cacheBlocks[block_index].needsApply |= true; - cacheBlocks[block_index].needsWB |= true; + if ((graphWorkload->activeCondition(cacheBlocks[block_index].items[wl_offset])) && + (!activeCacheBlocks.find(block_index))) { + activeCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); @@ -523,188 +575,40 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - // TODO: Make this more general and programmable. 
- if ((cacheBlocks[block_index].busyMask == 0)) { - if (cacheBlocks[block_index].needsApply) { - cacheBlocks[block_index].pendingApply = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " - "applyQueue.\n", __func__, block_index); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - } else { - assert(MSHR.size() <= numMSHREntries); - // cache line has conflict. - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflict.\n", __func__, block_index); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" - " back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" - " a write back.\n", __func__, block_index); - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - 
cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - } - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - -} - -void -CoalesceEngine::processNextPreWBApplyEvent() -{ - int block_index = preWBApplyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the preWBApplyQueue. " - "cacheBlock[%d] to be applied.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - - if (cacheBlocks[block_index].state == CacheState::PENDING_PRE_WB_APPLY) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsPreWBApply); - bool block_active = false; - for (int index = 0; index < numElementsPerLine; index++) { - bool active = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - block_active |= active; - if (active) { - // cacheWorkCount++; - // FUTUREME: When pulling from activeCacheBlocks, in case we - // face a block that is not in idle state, we basically pop - // that entry and push it to the back. We only delete entries - // in this buffer if pushed or evicted. 
- activeCacheBlocks.push_back(block_index); - } - } - if (block_active && !owner->running()) { - owner->start(); - } - - cacheBlocks[block_index].needsPreWBApply = false; + if (cacheBlocks[block_index].busyMask == 0) { if (cacheBlocks[block_index].hasConflict) { if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } else { - // FIXME: Solve below issue. - // Not dirty but could be active still. - // need to activate tracking + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); + } cacheBlocks[block_index].reset(); } } else { cacheBlocks[block_index].state = CacheState::IDLE; - } - cacheBlocks[block_index].lastChangedTick = curTick(); - } else { - - } - - if (cacheBlocks[block_index].pendingApply) { - assert(cacheBlocks[block_index].busyMask == 0); - for (int index = 0; index < numElementsPerLine; index++) { - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - if (do_push) { - int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (needsPush[bit_index_base + index] == 0) { - needsPush[bit_index_base + index] = 1; - _workCount++; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } - } - } - } - stats.bitvectorLength.sample(needsPush.count()); - - assert(cacheBlocks[block_index].needsWB); - cacheBlocks[block_index].needsApply = false; - 
cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - - assert(MSHR.size() <= numMSHREntries); - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflicts.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); } - DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - stats.numInvalidApplies++; - } - - applyQueue.pop_front(); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); } - - if (done()) { + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { owner->recvDoneSignal(); } } @@ -740,6 +644,10 @@ CoalesceEngine::processNextMemoryEvent() if ((!memoryFunctionQueue.empty())) { schedule(nextMemoryEvent, nextCycle()); } + + if (done()) { + owner->recvDoneSignal(); + } } void @@ -759,36 +667,68 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].valid); assert(!cacheBlocks[block_index].dirty); - assert(!cacheBlocks[block_index].needsPreWBApply); 
assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); bool need_send_pkt = true; // NOTE: Search postPushWBQueue - for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) { PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr = wb_pkt->getAddr()) { + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { wb_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + need_send_pkt = false; - postPushWBQueue.erase(wb); + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + DPRINTF(MSDebug, "%s: Found addr: %lu in postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + cacheBlocks[block_index].addr, postPushWBQueue.size()); + } else { + wb++; } } - for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { PacketPtr ab_pkt = std::get<0>(*ab); - if (cacheBlocks[block_index].addr = ab_pkt->getAddr()) { + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { ab_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + activeCacheBlocks.push_back(block_index); + need_send_pkt = false; - activeBuffer.erase(ab); + ab = activeBuffer.erase(ab); + delete ab_pkt; + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // pullsScheduled++; + // } + DPRINTF(MSDebug, "%s: Found addr: %lu in activeBuffer. 
" + "activeBuffer.size: %d.\n", __func__, + cacheBlocks[block_index].addr, activeBuffer.size()); + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; } } if (!need_send_pkt) { - cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsPreWBApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { Addr miss_addr = *it; Addr aligned_miss_addr = @@ -828,14 +768,16 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].state = CacheState::BUSY; } - if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { need_send_pkt = false; } if (need_send_pkt) { PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); @@ -852,25 +794,24 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) block_index, cacheBlocks[block_index].to_string()); if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { - assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].dirty); assert(cacheBlocks[block_index].hasConflict); - assert(!cacheBlocks[block_index].needsPreWBApply); assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); - Addr base_addr = cacheBlocks[block_index].addr; + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. + bool atom_active = false; for (int index = 0; index < numElementsPerLine; index++) { - if (cacheBlocks[block_index].items[index].active) { - Addr vertex_addr = base_addr + index * sizeof(WorkListItem); - // NOTE: Implement this - // workdir.activate() - // cacheWorkCount--; - } + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); } - if (activeCacheBlocks.find(block_index)) { + if (atom_active) { activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); } + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -878,9 +819,8 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + onTheFlyReqs++; cacheBlocks[block_index].reset(); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); } else { @@ 
-896,94 +836,54 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) void CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) { + if (postPushWBQueue.empty()) { + return; + } PacketPtr wb_pkt; Tick pkt_tick; std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); if (schedule_tick == pkt_tick) { memPort.sendPacket(wb_pkt); + onTheFlyReqs++; postPushWBQueue.pop_front(); + DPRINTF(MSDebug, "%s: Popped pkt: %s from postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + wb_pkt->print(), postPushWBQueue.size()); } } void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - WorkLocation bit_status; - Addr location; - int offset; - - std::tie(bit_status, location, offset) = getOptimalPullAddr(); - - if (bit_status != WorkLocation::GARBAGE) { - if (bit_status == WorkLocation::PENDING_READ) { - // renaming the outputs to thier local names. - Addr addr = location; - int index_offset = offset; - - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; + pullsScheduled--; + if (!directory->empty()) { + Addr addr = directory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); } - if (bit_status == WorkLocation::IN_CACHE) { - // renaming the outputs to their local names. 
- int block_index = (int) location; - int wl_offset = offset; - - Addr addr = cacheBlocks[block_index].addr; - Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); - int slice_base_index = getBitIndexBase(addr); - - needsPush[slice_base_index + wl_offset] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb; - std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].needsWB |= do_wb; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - cacheBlocks[block_index].items[wl_offset].edgeIndex, - cacheBlocks[block_index].items[wl_offset].degree); - } else { - DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - numPullsReceived--; + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); } - if (bit_status == WorkLocation::IN_MEMORY) { - if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - maxPotentialPostPushWB++; - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + 
memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); } } - - stats.bitvectorSearchStatus[bit_status]++; - - if (numPullsReceived > 0) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " - "0 to memoryFunctionQueue.\n", __func__); - } } void @@ -1000,26 +900,149 @@ CoalesceEngine::recvMemRetry() schedule(nextMemoryEvent, nextCycle()); } +int +CoalesceEngine::workCount() +{ + return activeCacheBlocks.size() + + directory->workCount() + activeBuffer.size(); +} + void CoalesceEngine::recvVertexPull() { - bool should_schedule = (numPullsReceived == 0); - numPullsReceived++; + pullsReceived++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. pullsReceived: %d.\n", __func__, pullsReceived); stats.verticesPulled++; stats.lastVertexPullTime = curTick() - stats.lastResetTick; - if (should_schedule) { + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (graphWorkload->activeCondition(items[index])) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + pullsReceived--; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active = false; + for (int index = 0; index < 
numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(items[index]); + } + // NOTE: If the atom is not active anymore. + if (!atom_active) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + DPRINTF(MSDebug, "%s: Empalced pkt: %s in postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + wb_pkt->print(), postPushWBQueue.size()); + activeBuffer.pop_front(); + DPRINTF(MSDebug, "%s: Popped pkt: %s from activeBuffer. " + "activeBuffer.size: %d.\n", __func__, + pkt->print(), activeBuffer.size()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!activeCacheBlocks.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = activeCacheBlocks.size(); + while (true) { + int block_index = activeCacheBlocks.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (graphWorkload->activeCondition(cacheBlocks[block_index].items[index])) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + pullsReceived--; + } + } + + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(cacheBlocks[block_index].items[index]); + } + // NOTE: If we have reached the last item in the cache block + if (!atom_active) { + 
activeCacheBlocks.erase(block_index); + } + break; + } + // NOTE: If the block with index at the front of activeCacheBlocks + // is not in IDLE state, then roll the that index to the back + activeCacheBlocks.pop_front(); + activeCacheBlocks.push_back(block_index); + // NOTE: If we have visited all the items initially in the FIFO. + num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find " + "work to apply.\n", __func__); + } + + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // pullsScheduled++; + // } + if (pullCondition()) { memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + pullsScheduled++; + } + + if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); } } + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), @@ -1036,16 +1059,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hit under misses."), ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), "Number of cache rejections caused by entry shortage."), - ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), - "Number of cache rejections caused by target shortage."), ADD_STAT(responsePortShortage, 
statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), - ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), - "Number of times a memory block has been read twice. " - "Once for push and once to populate the cache."), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), @@ -1054,13 +1072,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), - ADD_STAT(numInvalidApplies, statistics::units::Count::get(), - "Number of times a line has become busy" - " while waiting to be applied."), ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), "Number of times a scheduled memory function has been invalid."), - ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), - "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rate - #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/work_directory.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" - - namespace gem5 { -enum WorkLocation -{ - PENDING_READ, - IN_CACHE, - IN_MEMORY, - GARBAGE, - NUM_STATUS -}; - -enum CacheState -{ - INVALID, - PENDING_DATA, - BUSY, - IDLE, - PENDING_PRE_WB_APPLY, - PENDING_WB, - PENDING_PRE_PUSH_APPLY, - NUM_CACHE_STATE -}; - -const char* 
cacheStateStrings[NUM_CACHE_STATE] = { - "INVALID", - "PENDING_DATA", - "BUSY", - "IDLE", - "PENDING_PRE_WB_APPLY", - "PENDING_WB", - "PENDING_PRE_PUSH_APPLY" -}; - -enum ReadDestination -{ - READ_FOR_CACHE, - READ_FOR_PUSH -}; - class MPU; - -// TODO: Add active bit to WorkListItem class. Check active bit before activate -// Only activate if necessary and not active before. -class WorkDirectory -{ - private: - Addr memoryAtomSize; - int atomBlockSize; - size_t elementSize; - - int _workCount; - public: - AddrRange memoryRange; - WorkDirectory(Addr atom_size, int block_size, size_t element_size): - memoryAtomSize(atom_size), atomBlockSize(block_size), - elementSize(element_size), _workCount(0) - {} - - void activate(Addr addr); - void deactivate(Addr addr); - int workCount(); - std::tuple getNextWork(); -}; - class CoalesceEngine : public BaseMemoryEngine { private: @@ -117,7 +54,6 @@ class CoalesceEngine : public BaseMemoryEngine bool valid; bool dirty; bool hasConflict; - bool needsPreWBApply; CacheState state; Tick lastChangedTick; Block() {} @@ -127,7 +63,6 @@ class CoalesceEngine : public BaseMemoryEngine valid(false), dirty(false), hasConflict(false), - needsPreWBApply(false), state(CacheState::INVALID), lastChangedTick(0) { @@ -140,18 +75,15 @@ class CoalesceEngine : public BaseMemoryEngine valid = false; dirty = false; hasConflict = false; - needsPreWBApply = false; state = CacheState::INVALID; lastChangedTick = 0; } std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "dirty: %s, hasConflict: %s, needsPreWBApply: %s" - "state: %s, lastChangedTick: %lu}", addr, busyMask, - valid ? "true" : "false", dirty ? "true" : "false", - hasConflict ? "true" : "false", - needsPreWBApply ? "true" : "false", + "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + dirty ? "true" : "false", hasConflict ? 
"true" : "false", cacheStateStrings[state], lastChangedTick); } }; @@ -164,8 +96,11 @@ class CoalesceEngine : public BaseMemoryEngine }; MPU* owner; + WorkDirectory* directory; GraphWorkload* graphWorkload; + Addr lastAtomAddr; + int numLines; int numElementsPerLine; Block* cacheBlocks; @@ -179,26 +114,26 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> responseQueue; // Tracking work in cache - int cacheWorkCount; - int numPullsReceived; - UniqueFIFO preWBApplyQueue; + int pullsReceived; // NOTE: Remember to erase from this upon eviction from cache UniqueFIFO activeCacheBlocks; + int pullsScheduled; int pendingPullReads; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. - std::unordered_map pendingVertexPullReads; + std::unordered_set pendingPullAddrs; int activeBufferSize; int postPushWBQueueSize; std::deque> activeBuffer; std::deque> postPushWBQueue; + bool timeToPull(); + bool canSchedulePull(); + bool workLeftInMem(); + bool pullCondition(); int getBlockIndex(Addr addr); - // TODO: Should be moved to WorkDirectory - int getBitIndexBase(Addr addr); - Addr getBlockAddrFromBitIndex(int index); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -212,11 +147,8 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); - EventFunctionWrapper nextPreWBApplyEvent; - void processNextPreWBApplyEvent(); - - EventFunctionWrapper nextPrePushApplyEvent; - void processNextPrePushApplyEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); struct CoalesceStats : public statistics::Group { @@ -236,19 +168,14 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; - 
statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidApplies; statistics::Scalar numInvalidWriteBacks; - statistics::Vector bitvectorSearchStatus; - statistics::Formula hitRate; statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; @@ -272,12 +199,14 @@ class CoalesceEngine : public BaseMemoryEngine void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt); - bool recvWLRead(Addr addr); + void postMemInitSetup(); + + void createPopCountDirectory(int atoms_per_block); + + ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - // FIXME: Update this to return sum of cacheWorkCount and WorkDirectory - // workcount. - int workCount() { return _workCount; } + int workCount(); void recvVertexPull(); bool done(); diff --git a/src/accl/graph/sega/coalesce_engine_bak.cc b/src/accl/graph/sega/coalesce_engine_bak.cc deleted file mode 100644 index 7a064c1c2f..0000000000 --- a/src/accl/graph/sega/coalesce_engine_bak.cc +++ /dev/null @@ -1,1308 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/coalesce_engine.hh" - -#include - -#include "accl/graph/sega/mpu.hh" -#include "base/intmath.hh" -#include "debug/CacheBlockState.hh" -#include "debug/CoalesceEngine.hh" -#include "debug/SEGAStructureSize.hh" -#include "mem/packet_access.hh" -#include "sim/sim_exit.hh" - -namespace gem5 -{ - -CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), - numLines((int) (params.cache_size / peerMemoryAtomSize)), - numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), - numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), - maxPotentialPostPushWB(0), - nextMemoryEvent([this] { - processNextMemoryEvent(); - }, name() + ".nextMemoryEvent"), - nextResponseEvent([this] { - processNextResponseEvent(); - }, name() + ".nextResponseEvent"), - nextPreWBApplyEvent([this] { - processNextPreWBApplyEvent(); - }, name() + ".nextPreWBApplyEvent"), - stats(*this) -{ - assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); - cacheBlocks = new Block [numLines]; - for (int i = 0; i < numLines; i++) { - cacheBlocks[i] = Block(numElementsPerLine); - } - needsPush.reset(); -} - -void -CoalesceEngine::registerMPU(MPU* mpu) -{ - owner = mpu; -} - -void -CoalesceEngine::recvFunctional(PacketPtr pkt) -{ - if (pkt->isRead()) { - assert(pkt->getSize() == peerMemoryAtomSize); - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsApply); - // NOTE: No need to check needsWB because there might be entries - // that have been updated and not written back in the cache. 
- // assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - pkt->makeResponse(); - pkt->setDataFromBlock( - (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - } else { - memPort.sendFunctional(pkt); - } - } else { - // TODO: Add and implement init function for GraphWorkload. - int bit_index_base = getBitIndexBase(pkt->getAddr()); - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); - memPort.sendFunctional(pkt); - } -} - -bool -CoalesceEngine::done() -{ - return applyQueue.empty() && needsPush.none() && - memoryFunctionQueue.empty() && (onTheFlyReqs == 0); -} - -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBlockIndex(Addr addr) -{ - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; -} - -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBitIndexBase(Addr addr) -{ - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - return atom_index * block_bits; -} - -// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) -Addr -CoalesceEngine::getBlockAddrFromBitIndex(int index) -{ - assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr trimmed_addr = index * sizeof(WorkListItem); - return peerMemoryRange.addIntlvBits(trimmed_addr); -} - -bool -CoalesceEngine::recvWLRead(Addr addr) -{ - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - assert(aligned_addr % peerMemoryAtomSize == 0); - int block_index = getBlockIndex(aligned_addr); - assert(block_index < numLines); - int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - assert(wl_offset < 
numElementsPerLine); - DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " - "This request maps to cacheBlocks[%d], aligned_addr: " - "%lu, and wl_offset: %d.\n", __func__, addr, - block_index, aligned_addr, wl_offset); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].valid)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); - stats.readHits++; - assert(!cacheBlocks[block_index].pendingData); - // No cache block could be in pendingApply and pendingWB at the - // same time. - assert(!(cacheBlocks[block_index].pendingApply && - cacheBlocks[block_index].pendingWB)); - // Hit - // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextResponseEvent for latency cycles in - // the future. - responseQueue.push_back(std::make_tuple( - addr, cacheBlocks[block_index].items[wl_offset], curTick())); - - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // If they are scheduled for apply and WB those schedules should be - // discarded. Since there is no easy way to take items out of the - // function queue. Those functions check for their respective bits - // and skip the process if the respective bit is set to false. 
- cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - // HACK: If a read happens on the same cycle as another operation such - // as apply set lastChangedTick to half a cycle later so that operation - // scheduled by the original operation (apply in this example) are - // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" - cacheBlocks[block_index].lastChangedTick = - curTick() + (Tick) (clockPeriod() / 2); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - if (!nextResponseEvent.scheduled()) { - schedule(nextResponseEvent, nextCycle()); - } - stats.numVertexReads++; - return true; - } else if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].pendingData)) { - // Hit under miss - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", - __func__, addr); - stats.readHitUnderMisses++; - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " - "cacheBlocks[%d].\n", __func__, block_index); - } - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - stats.numVertexReads++; - return true; - } else { - // miss - // FIXME: Make this assert work. It will break if the cache block - // is cold and addr or aligned_addr is 0. It fails because cache block - // addr field is initialized to 0. Unfortunately Addr type is unsigned. - // So you can not initialized addr to -1. - assert(cacheBlocks[block_index].addr != aligned_addr); - assert(MSHR.size() <= numMSHREntries); - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" - " %lu not found in MSHRs.\n", __func__, block_index, addr); - if (MSHR.size() == numMSHREntries) { - // Out of MSHR entries - DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" - "Rejecting request.\n", __func__); - // TODO: Break out read rejections into more than one stat - // based on the cause of the rejection - stats.mshrEntryShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR " - "entries available.\n", __func__); - if ((cacheBlocks[block_index].valid) || - (cacheBlocks[block_index].pendingData)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state.\n", __func__, block_index); - // We're in idle state - // Idle: valid && !pendingApply && !pendingWB; - // Note 0: needsApply has to be false. Because - // A cache line enters the idle state from two - // other states. First a busy state that does not - // need apply (needsApply is already false) or - // from pendingApplyState after being applied which - // clears the needsApply bit. needsApply is useful - // when a cache block has transitioned from - // pendingApply to busy without the apply happening. - // Note 1: pendingData does not have to be evaluated - // becuase pendingData is cleared when data - // arrives from the memory and valid does not - // denote cleanliness of the line. Rather it - // is used to differentiate between empty blocks - // and the blocks that have data from memory. - // pendingData denotes the transient state between - // getting a miss and getting the data for that miss. - // valid basically means that the data in the cache - // could be used to respond to read/write requests. - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - // There are no conflicts in idle state. 
- assert(MSHR.find(block_index) == MSHR.end()); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" - "to be written back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextWriteBack for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " - "not need to be written back.\n", - __func__, block_index); - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextRead for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } - } - // cacheBlocks[block_index].hasConflict = true; - 
MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - stats.readMisses++; - // TODO: Add readConflicts here. - stats.numVertexReads++; - return true; - } else { - // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " - "Allocating a cache line for it.\n" - , __func__, addr); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR[block_index].size() == 0); - - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" - " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " - "input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - 
} - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, - cacheBlocks[block_index].to_string()); - stats.readMisses++; - stats.numVertexReads++; - return true; - } - } - } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs. It has a conflict " - "with addr: %lu.\n", __func__, block_index, addr, - cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - assert(MSHR[block_index].size() > 0); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } - DPRINTF(CoalesceEngine, "%s: There is room for another target " - "for cacheBlocks[%d].\n", __func__, block_index); - - // TODO: Might want to differentiate between different misses. - stats.readMisses++; - - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " - "cacheBlocks[%d].\n", __func__, addr, block_index); - stats.numVertexReads++; - return true; - } - } -} - -bool -CoalesceEngine::handleMemResp(PacketPtr pkt) -{ - assert(pkt->isResponse()); - DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", - __func__, pkt->print()); - if (pkt->isWrite()) { - DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); - delete pkt; - return true; - } - - onTheFlyReqs--; - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - WorkListItem* items = pkt->getPtr(); - - bool do_wb = false; - if (pkt->findNextSenderState()) { - assert(!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid))); - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. 
- - DPRINTF(CoalesceEngine, "%s: Received read response for pull read " - "for addr %lu.\n", __func__, addr); - int it = getBitIndexBase(addr); - uint64_t send_mask = pendingVertexPullReads[addr]; - // No applying of the line needed. - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - uint64_t vertex_send_mask = send_mask & (1 << i); - if (vertex_send_mask != 0) { - assert(needsPush[it + i] == 1); - needsPush[it + i] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb_v; - std::tie(delta, do_push, do_wb_v) = - graphWorkload->prePushApply(items[i]); - do_wb |= do_wb_v; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - items[i].edgeIndex, items[i].degree); - } else { - // TODO: Add a stat to count this. - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - } - } - pendingVertexPullReads.erase(addr); - maxPotentialPostPushWB--; - } - - bool cache_wb = false; - if (cacheBlocks[block_index].addr == addr) { - DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR.find(block_index) != MSHR.end()); - std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[i])); - } - 
cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsWB |= do_wb; - cacheBlocks[block_index].pendingData = false; - // HACK: In case processNextRead is called on the same tick as curTick - // and is scheduled to read to the same cacheBlocks[block_index] - cacheBlocks[block_index].lastChangedTick = - curTick() - (Tick) (clockPeriod() / 2); - cache_wb = true; - } else if (do_wb) { - PacketPtr wb_pkt = createWritePacket( - addr, peerMemoryAtomSize, (uint8_t*) items); - postPushWBQueue.emplace_back(wb_pkt, curTick()); - memoryFunctionQueue.emplace_back( - [this] (int ignore, Tick schedule_tick) { - processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - // TODO: Add a stat to count this. - // FIXME: This is not a totally wasteful read. e.g. all reads - // for pull in BFS are like this. - DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); - } - - if (cache_wb) { - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. 
responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - } - - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } - - - // TODO: Probably check for done here too. - delete pkt; - return true; -} - -// TODO: For loop to empty the entire responseQueue. -void -CoalesceEngine::processNextResponseEvent() -{ - int num_responses_sent = 0; - - Addr addr_response; - WorkListItem worklist_response; - Tick response_queueing_tick; - while(true) { - std::tie(addr_response, worklist_response, response_queueing_tick) = - responseQueue.front(); - Tick waiting_ticks = curTick() - response_queueing_tick; - if (ticksToCycles(waiting_ticks) < 1) { - break; - } - owner->handleIncomingWL(addr_response, worklist_response); - num_responses_sent++; - DPRINTF(CoalesceEngine, - "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, - graphWorkload->printWorkListItem(worklist_response), - addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - stats.responseQueueLatency.sample( - waiting_ticks * 1e9 / getClockFrequency()); - if (num_responses_sent >= maxRespPerCycle) { - if (!responseQueue.empty()) { - stats.responsePortShortage++; - } - break; - } - if (responseQueue.empty()) { - break; - } - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } -} - -void -CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) -{ - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - int block_index = getBlockIndex(aligned_addr); - int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " - "wl: %s. This request maps to cacheBlocks[%d], " - "aligned_addr: %lu, and wl_offset: %d.\n", - __func__, addr, graphWorkload->printWorkListItem(wl), - block_index, aligned_addr, wl_offset); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, - graphWorkload->printWorkListItem(wl), addr); - // Desing does not allow for write misses for now. - assert(cacheBlocks[block_index].addr == aligned_addr); - // cache state asserts - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask != 0); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - // respective bit in busyMask for wl is set. 
- assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == - (1 << wl_offset)); - - if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { - cacheBlocks[block_index].needsWB |= true; - stats.numVertexWrites++; - } - cacheBlocks[block_index].items[wl_offset] = wl; - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { - cacheBlocks[block_index].needsApply |= true; - cacheBlocks[block_index].needsWB |= true; - } - - cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, wl_offset, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) { - if (cacheBlocks[block_index].needsApply) { - cacheBlocks[block_index].pendingApply = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " - "applyQueue.\n", __func__, block_index); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - } else { - assert(MSHR.size() <= numMSHREntries); - // cache line has conflict. 
- if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflict.\n", __func__, block_index); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" - " back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" - " a write back.\n", __func__, block_index); - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - 
(!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - } - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - -} - -void -CoalesceEngine::processNextPreWBApplyEvent() -{ - int block_index = applyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " - "cacheBlock[%d] to be applied.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingWB); - - if (cacheBlocks[block_index].pendingApply) { - assert(cacheBlocks[block_index].busyMask == 0); - for (int index = 0; index < numElementsPerLine; index++) { - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - if (do_push) { - int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (needsPush[bit_index_base + index] == 0) { - needsPush[bit_index_base + index] = 1; - _workCount++; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } - } - } - } - stats.bitvectorLength.sample(needsPush.count()); - - assert(cacheBlocks[block_index].needsWB); - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - - assert(MSHR.size() <= numMSHREntries); - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflicts.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - 
[this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - stats.numInvalidApplies++; - } - - applyQueue.pop_front(); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - - if (done()) { - owner->recvDoneSignal(); - } -} - -void -CoalesceEngine::processNextMemoryEvent() -{ - if (memPort.blocked()) { - stats.numMemoryBlocks++; - nextMemoryEvent.sleep(); - return; - } - - DPRINTF(CoalesceEngine, "%s: Processing another " - "memory function.\n", __func__); - std::function next_memory_function; - int next_memory_function_input; - Tick next_memory_function_tick; - std::tie( - next_memory_function, - next_memory_function_input, - next_memory_function_tick) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input, next_memory_function_tick); - memoryFunctionQueue.pop_front(); - stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) - * 1e9 / getClockFrequency()); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); - - assert(!nextMemoryEvent.pending()); - assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { - schedule(nextMemoryEvent, nextCycle()); - } -} - -void -CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) -{ - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", - __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - // A cache block should not be touched while it's waiting for data. - // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - - if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { - return; - } - - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - bool need_send_pkt = true; - for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) - { - PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { - wb_pkt->writeDataToBlock( - (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - cacheBlocks[block_index].needsWB = true; - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == cacheBlocks[block_index].addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - 
responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } - postPushWBQueue.erase(wb); - need_send_pkt = false; - } - } - - if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { - need_send_pkt = false; - } - - if (need_send_pkt) { - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != - pendingVertexPullReads.end()) { - stats.numDoubleMemReads++; - } - } -} - -void -CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) -{ - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", - __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(cacheBlocks[block_index].pendingWB); - - // Why would we write it back if it does not have a conflict. 
- assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - - PacketPtr pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to " - "Addr: %lu, size = %d.\n", __func__, - pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - // onTheFlyReqs++; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].pendingWB = false; - - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " - "write back has been scheduled for it. 
Ignoring " - "the current write back scheduled at tick %lu for " - "the right function scheduled later.\n", - __func__, block_index, schedule_tick); - stats.numInvalidWriteBacks++; - } -} - -void -CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) -{ - PacketPtr wb_pkt; - Tick pkt_tick; - std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); - if (schedule_tick == pkt_tick) { - memPort.sendPacket(wb_pkt); - postPushWBQueue.pop_front(); - } -} - -std::tuple -CoalesceEngine::getOptimalPullAddr() -{ - int visited_bits = 0; - int num_intial_active_bits = activeBits.size(); - while (visited_bits < num_intial_active_bits) { - int index = activeBits.front(); - int base_index = roundDown(index, numElementsPerLine); - int index_offset = index - base_index; - assert(needsPush[index] == 1); - assert(index_offset < numElementsPerLine); - - Addr addr = getBlockAddrFromBitIndex(base_index); - int block_index = getBlockIndex(addr); - if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) - { - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::PENDING_READ, addr, index_offset); - } else { - // Only if it is in cache and it is in idle state. 
- if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_CACHE, block_index, index_offset); - // Otherwise if it is in memory - } else if ((cacheBlocks[block_index].addr != addr)) { - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_MEMORY, addr, index_offset); - } - } - activeBits.pop_front(); - activeBits.push_back(index); - visited_bits++; - } - - return std::make_tuple(WorkLocation::GARBAGE, 0, 0); -} - -void -CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) -{ - WorkLocation bit_status; - Addr location; - int offset; - - std::tie(bit_status, location, offset) = getOptimalPullAddr(); - - if (bit_status != WorkLocation::GARBAGE) { - if (bit_status == WorkLocation::PENDING_READ) { - // renaming the outputs to thier local names. - Addr addr = location; - int index_offset = offset; - - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } - if (bit_status == WorkLocation::IN_CACHE) { - // renaming the outputs to their local names. 
- int block_index = (int) location; - int wl_offset = offset; - - Addr addr = cacheBlocks[block_index].addr; - Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); - int slice_base_index = getBitIndexBase(addr); - - needsPush[slice_base_index + wl_offset] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb; - std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].needsWB |= do_wb; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - cacheBlocks[block_index].items[wl_offset].edgeIndex, - cacheBlocks[block_index].items[wl_offset].degree); - } else { - DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - numPullsReceived--; - } - if (bit_status == WorkLocation::IN_MEMORY) { - if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - maxPotentialPostPushWB++; - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } - } - } - - stats.bitvectorSearchStatus[bit_status]++; - - if (numPullsReceived > 0) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " - "0 to memoryFunctionQueue.\n", __func__); - } -} - -void -CoalesceEngine::recvMemRetry() -{ - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - - if (!nextMemoryEvent.pending()) { - 
DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - return; - } - assert(!nextMemoryEvent.scheduled()); - nextMemoryEvent.wake(); - schedule(nextMemoryEvent, nextCycle()); -} - -void -CoalesceEngine::recvVertexPull() -{ - bool should_schedule = (numPullsReceived == 0); - numPullsReceived++; - - stats.verticesPulled++; - stats.lastVertexPullTime = curTick() - stats.lastResetTick; - if (should_schedule) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } -} - -CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) - : statistics::Group(&_coalesce), - coalesce(_coalesce), - lastResetTick(0), - ADD_STAT(numVertexReads, statistics::units::Count::get(), - "Number of memory vertecies read from cache."), - ADD_STAT(numVertexWrites, statistics::units::Count::get(), - "Number of memory vertecies written to cache."), - ADD_STAT(readHits, statistics::units::Count::get(), - "Number of cache hits."), - ADD_STAT(readMisses, statistics::units::Count::get(), - "Number of cache misses."), - ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), - "Number of cache hit under misses."), - ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), - "Number of cache rejections caused by entry shortage."), - ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), - "Number of cache rejections caused by target shortage."), - ADD_STAT(responsePortShortage, statistics::units::Count::get(), - "Number of times a response has been " - "delayed because of port shortage. "), - ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), - "Number of times memory bandwidth was not available."), - ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), - "Number of times a memory block has been read twice. 
" - "Once for push and once to populate the cache."), - ADD_STAT(verticesPulled, statistics::units::Count::get(), - "Number of times a pull request has been sent by PushEngine."), - ADD_STAT(verticesPushed, statistics::units::Count::get(), - "Number of times a vertex has been pushed to the PushEngine"), - ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), - "Time of the last pull request. (Relative to reset_stats)"), - ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), - "Time of the last vertex push. (Relative to reset_stats)"), - ADD_STAT(numInvalidApplies, statistics::units::Count::get(), - "Number of times a line has become busy" - " while waiting to be applied."), - ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), - "Number of times a scheduled memory function has been invalid."), - ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), - "Distribution for the location of vertex searches."), - ADD_STAT(hitRate, statistics::units::Ratio::get(), - "Hit rate in the cache."), - ADD_STAT(vertexPullBW, statistics::units::Rate::get(), - "Rate at which pull requests arrive."), - ADD_STAT(vertexPushBW, statistics::units::Rate::get(), - "Rate at which vertices are pushed."), - ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries."), - ADD_STAT(bitvectorLength, statistics::units::Count::get(), - "Histogram of the length of the bitvector."), - ADD_STAT(responseQueueLatency, statistics::units::Second::get(), - "Histogram of the response latency to WLEngine. 
(ns)"), - ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), - "Histogram of the latency of processing a memory function.") -{ -} - -void -CoalesceEngine::CoalesceStats::regStats() -{ - using namespace statistics; - - bitvectorSearchStatus.init(NUM_STATUS); - bitvectorSearchStatus.subname(0, "PENDING_READ"); - bitvectorSearchStatus.subname(1, "IN_CACHE"); - bitvectorSearchStatus.subname(2, "IN_MEMORY"); - bitvectorSearchStatus.subname(3, "GARBAGE"); - - hitRate = (readHits + readHitUnderMisses) / - (readHits + readHitUnderMisses + readMisses); - - vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; - - vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - - mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); - bitvectorLength.init(64); - responseQueueLatency.init(64); - memoryFunctionLatency.init(64); -} - -void -CoalesceEngine::CoalesceStats::resetStats() -{ - statistics::Group::resetStats(); - - lastResetTick = curTick(); -} - -} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine_bak.hh b/src/accl/graph/sega/coalesce_engine_bak.hh deleted file mode 100644 index 0787a334c1..0000000000 --- a/src/accl/graph/sega/coalesce_engine_bak.hh +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ - -#include - -#include "accl/graph/base/data_structs.hh" -#include "accl/graph/base/graph_workload.hh" -#include "accl/graph/sega/base_memory_engine.hh" -#include "base/cprintf.hh" -#include "base/statistics.hh" -#include "params/CoalesceEngine.hh" - - - -namespace gem5 -{ - -enum WorkLocation -{ - PENDING_READ, - IN_CACHE, - IN_MEMORY, - GARBAGE, - NUM_STATUS -}; - -class MPU; - -class CoalesceEngine : public BaseMemoryEngine -{ - private: - struct Block - { - WorkListItem* items; - Addr addr; - uint64_t busyMask; - bool valid; - bool needsApply; - bool needsWB; - bool pendingData; - bool pendingApply; - bool pendingWB; - Tick lastChangedTick; - // TODO: This might be useful in the future - // Tick lastWLWriteTick; - Block() {} - Block(int num_elements): - addr(-1), - busyMask(0), - valid(false), - needsApply(false), - needsWB(false), - pendingData(false), - pendingApply(false), - pendingWB(false), - lastChangedTick(0), - { - items = new WorkListItem [num_elements]; - } - - std::string to_string() { - return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", - addr, busyMask, valid ? "true" : "false", - needsApply ? "true" : "false", needsWB ? "true" : "false", - pendingData ? "true" : "false", pendingApply ? "true" : "false", - pendingWB ? 
"true" : "false", lastChangedTick); - } - }; - - struct SenderState : public Packet::SenderState - { - bool isRetry; - SenderState(bool is_retry): isRetry(is_retry) {} - }; - MPU* owner; - GraphWorkload* graphWorkload; - - int numLines; - int numElementsPerLine; - Block* cacheBlocks; - - int onTheFlyReqs; - int numMSHREntries; - int numTgtsPerMSHR; - std::unordered_map> MSHR; - int maxRespPerCycle; - std::deque> responseQueue; - - int _workCount; - int numPullsReceived; - UniqueFIFO applyQueue; - std::bitset needsPush; - std::deque activeBits; - int postPushWBQueueSize; - std::deque> postPushWBQueue; - - int getBlockIndex(Addr addr); - int getBitIndexBase(Addr addr); - Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); - - int maxPotentialPostPushWB; - // A map from addr to sendMask. sendMask determines which bytes to - // send for push when getting the read response from memory. - std::unordered_map pendingVertexPullReads; - - MemoryEvent nextMemoryEvent; - void processNextMemoryEvent(); - void processNextRead(int block_index, Tick schedule_tick); - void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextVertexPull(int ignore, Tick schedule_tick); - void processNextPostPushWB(int ignore, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; - - EventFunctionWrapper nextResponseEvent; - void processNextResponseEvent(); - - EventFunctionWrapper nextPreWBApplyEvent; - void processNextPreWBApplyEvent(); - - struct CoalesceStats : public statistics::Group - { - CoalesceStats(CoalesceEngine &coalesce); - - virtual void regStats() override; - - virtual void resetStats() override; - - CoalesceEngine &coalesce; - - Tick lastResetTick; - - statistics::Scalar numVertexReads; - statistics::Scalar numVertexWrites; - statistics::Scalar readHits; - statistics::Scalar readMisses; - statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; - 
statistics::Scalar responsePortShortage; - statistics::Scalar numMemoryBlocks; - statistics::Scalar numDoubleMemReads; - statistics::Scalar verticesPulled; - statistics::Scalar verticesPushed; - statistics::Scalar lastVertexPullTime; - statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidApplies; - statistics::Scalar numInvalidWriteBacks; - - statistics::Vector bitvectorSearchStatus; - - statistics::Formula hitRate; - statistics::Formula vertexPullBW; - statistics::Formula vertexPushBW; - - statistics::Histogram mshrEntryLength; - statistics::Histogram bitvectorLength; - statistics::Histogram responseQueueLatency; - statistics::Histogram memoryFunctionLatency; - }; - - CoalesceStats stats; - - protected: - virtual void recvMemRetry() override; - virtual bool handleMemResp(PacketPtr pkt) override; - - public: - PARAMS(CoalesceEngine); - CoalesceEngine(const Params ¶ms); - void registerMPU(MPU* mpu); - - void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); - - bool recvWLRead(Addr addr); - void recvWLWrite(Addr addr, WorkListItem wl); - - int workCount() { return _workCount; } - void recvVertexPull(); - - bool done(); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc new file mode 100644 index 0000000000..8c9d223178 --- /dev/null +++ b/src/accl/graph/sega/enums.cc @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB", + "LOCKED_FOR_APPLY" +}; + + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..e7a8f84452 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + LOCKED_FOR_APPLY, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index b30060238d..f661bd68a6 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -87,12 +87,6 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, pushEngine->recvVertexPush(addr, delta, edge_index, degree); } -void -MPU::recvPrevPullCorrection() -{ - pushEngine->recvPrevPullCorrection(); -} - void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8f3b29f603..ad18a0d5a5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" #include "accl/graph/sega/push_engine.hh" #include 
"accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" @@ -64,10 +65,12 @@ class MPU : public SimObject AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); - bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); void recvWorkload(GraphWorkload* Workload); @@ -77,7 +80,6 @@ class MPU : public SimObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); - void recvPrevPullCorrection(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 07f37a28dc..a17991e335 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -155,13 +155,13 @@ void PushEngine::start() { assert(!_running); - assert(!nextVertexPullEvent.scheduled()); + // assert(!nextVertexPullEvent.scheduled()); _running = true; stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. 
assert(workLeft()); - if (vertexSpace()) { + if (vertexSpace() && !nextVertexPullEvent.scheduled()) { schedule(nextVertexPullEvent, nextCycle()); } } @@ -169,17 +169,16 @@ PushEngine::start() void PushEngine::processNextVertexPullEvent() { - // TODO: change edgePointerQueueSize - numPendingPulls++; - owner->recvVertexPull(); - - if (!workLeft()) { + if (workLeft()) { + numPendingPulls++; + owner->recvVertexPull(); + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + } else { _running = false; lastIdleEntranceTick = curTick(); - } - - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { - schedule(nextVertexPullEvent, nextCycle()); + DPRINTF(PushEngine, "%s: In idle state now.\n", __func__); } } @@ -197,9 +196,9 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, sizeof(Edge), peerMemoryAtomSize); edgePointerQueue.emplace_back(info_gen, curTick()); - numPendingPulls--; - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } @@ -209,16 +208,6 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, } } -void -PushEngine::recvPrevPullCorrection() -{ - assert(numPendingPulls > 0); - numPendingPulls--; - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { - schedule(nextVertexPullEvent, nextCycle()); - } -} - void PushEngine::processNextMemoryReadEvent() { @@ -255,7 +244,7 @@ PushEngine::processNextMemoryReadEvent() } } - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2e1de25390..08cceb14f0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -32,6 +32,7 @@ #include 
"accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -199,7 +200,6 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); - void recvPrevPullCorrection(); void recvReqRetry(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a698f2cc0a..2b305e1557 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -134,7 +134,7 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::checkRetryReq() { - for (int i = 0; i < inPorts.size(); ++i) { + for (int i = 0; i < inPorts.size(); i++) { inPorts[i].checkRetryReq(); } } @@ -191,12 +191,8 @@ WLEngine::processNextReadEvent() if (registerFile.size() < registerFileSize) { DPRINTF(WLEngine, "%s: There are free registers available in the " "registerFile.\n", __func__); - // TODO: It might be a good idea for WLEngine to act differently - // on cache rejects. As a first step the cache should not just - // return a boolean value. It should return an integer/enum - // to tell WLEngine why it rejected the read request. Their might - // be things that WLEngine can do to fix head of the line blocking. 
- if (owner->recvWLRead(update_addr)) { + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " "request to addr: %lu.\n", __func__, update_addr); registerFile[update_addr] = update_value; @@ -209,7 +205,8 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); - stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -220,6 +217,17 @@ WLEngine::processNextReadEvent() update_value, updateQueue.size(), updateQueueSize); checkRetryReq(); vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + updateQueue.pop_front(); + updateQueue.emplace_back( + update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + } else { + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Not rolling the update.\n", __func__); + } } } else { DPRINTF(WLEngine, "%s: There are no free registers " @@ -227,7 +235,6 @@ WLEngine::processNextReadEvent() stats.registerShortage++; } } else { - // TODO: Generalize this to reduce function rather than just min DPRINTF(WLEngine, "%s: A register has already been allocated for " "addr: %lu in registerFile. 
registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); @@ -238,7 +245,8 @@ WLEngine::processNextReadEvent() update_value, update_addr, registerFile[update_addr]); stats.registerFileCoalesce++; updateQueue.pop_front(); - stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f442d6060e..b5ad3d9040 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -35,6 +35,7 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/enums.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh new file mode 100644 index 0000000000..4102e29cd3 --- /dev/null +++ b/src/accl/graph/sega/work_directory.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ + +#include "base/addr_range.hh" +#include "base/types.hh" + +namespace gem5 +{ + +class WorkDirectory +{ + public: + virtual void activate(Addr atom_addr) = 0; + virtual void deactivate(Addr atom_addr) = 0; + virtual Addr getNextWork() = 0; + + virtual int workCount() = 0; + bool empty() { return workCount() == 0; } + + virtual void setLastAtomAddr(Addr atom_addr) = 0; +}; + +class PopCountDirectory: public WorkDirectory +{ + private: + AddrRange memoryRange; + + int numAtomsPerBlock; + int memoryAtomSize; + int blockSize; + + uint32_t _workCount; + + int numCounters; + int lastCounterIndex; + uint32_t* popCount; + + uint32_t currentIndex; + uint32_t currentCounter; + + int getIndexFromAtomAddr(Addr atom_addr) + { + assert((atom_addr % memoryAtomSize) == 0); + Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr); + int index = (int) (trimmed_addr / blockSize); + return index; + } + + Addr getAtomAddrFromIndex(int block_index, int atom_index) + { + Addr block_addr = block_index * blockSize; + Addr trimmed_addr = block_addr + atom_index * memoryAtomSize; + return memoryRange.addIntlvBits(trimmed_addr); + } + + public: + PopCountDirectory(AddrRange mem_range, int atoms_per_block, int atom_size): + WorkDirectory(), + memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), + memoryAtomSize(atom_size), _workCount(0), + currentIndex(0), currentCounter(0) + { + blockSize = numAtomsPerBlock * memoryAtomSize; + int numCounters = (int) (memoryRange.size() / blockSize); + lastCounterIndex = numCounters - 1; + popCount = new uint32_t [numCounters]; + for (int index = 0; index < numCounters; index++) { + popCount[index] = 0; + } + } + + // CAUTION: This should only be called when the work + // directory **is not** tracking the the atom with atom_addr + virtual void activate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = 
popCount[index]; + popCount[index]++; + _workCount++; + assert(popCount[index] > prev_count); + assert(popCount[index] <= numAtomsPerBlock); + } + + // CAUTION: This should only be called when the work + // directory **is** tracking the the atom with atom_addr + virtual void deactivate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = popCount[index]; + popCount[index]--; + _workCount--; + assert(popCount[index] < prev_count); + assert(popCount[index] <= numAtomsPerBlock); + } + + virtual int workCount() { return _workCount; } + + void setLastAtomAddr(Addr atom_addr) + { + lastCounterIndex = getIndexFromAtomAddr(atom_addr); + } + + // CAUTION: If this function returns an addr that + // is in the cache, that addr should be ignored. + // CAUTION: The receiver should track the last n + // addresses that this WorkDirectory has generated. + // where n is equal to the size of the entry holding + // reads generated by this WorkDirectory. In case + // the WorkDirectory generates a repeated address + // it should be ignored. + // FIXME: This should return garbage if it can't find anything. + // virtual Addr getNextWork() + // { + // if ((currentCounter == numAtomsPerBlock) || + // (popCount[currentIndex] == 0)) { + // int prev_index = currentIndex; + // while (true) { + // currentIndex++; + // // NOTE: this is an optimization. + // // lastCounterIndex tracks the last blockOfAtom that + // // has vertices. By default it is set to numCounters - 1. + // // However, it might not be necessary to track all the + // // numCounters counters. e.g. If this WorkDirectory is tracking + // // a 512 MiB memory with atom size of 32 B and 256 atoms + // // per block. Then it needs 64 Ki counters of 8 bit wide. + // // However, if we need 8 Mi atoms to store all our vertices, + // // the second half of the counters would not be used at all + // // (512 MiB hold 16 Mi atoms and we're only using half). 
+ // if (currentIndex > lastCounterIndex) { + // currentIndex = 0; + // } + // if (prev_index == currentIndex) { + // // NOTE: If we have reached the same index as before, + // // we need to decrement the currentCounter to generate + // // a repeatative address. This way the receiver can detect + // // the uselessness of the generated address and ignore it + // currentCounter--; + // break; + // } + // if (popCount[currentIndex] > 0) { + // currentCounter = 0; + // break; + // } + // } + // } + // Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + // currentCounter++; + + // return ret_addr; + // } + + virtual Addr getNextWork() + { + if ((currentCounter == numAtomsPerBlock) || + (popCount[currentIndex] == 0)) { + int other_count = _workCount - popCount[currentIndex]; + if (other_count == 0) { + currentCounter = 0; + } else { + int prev_index = currentIndex; + while (true) { + currentIndex++; + if (currentIndex > lastCounterIndex) { + currentIndex = 0; + } + if (currentIndex == prev_index) { + break; + } + if (popCount[currentIndex] > 0) { + break; + } + } + currentCounter = 0; + } + } + Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + currentCounter++; + return ret_addr; + } +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index c65d68a5a7..3cbacef800 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -212,7 +212,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; From c4fc96e2146aeec5e7a978c11dfd4e5b36a7a67b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 Nov 2022 19:53:35 -0800 Subject: [PATCH 212/247] Adding new stats. 
--- configs/accl/sega.py | 12 ++++-- src/accl/graph/sega/CoalesceEngine.py | 2 - src/accl/graph/sega/coalesce_engine.cc | 51 ++++++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 16 ++++++-- src/accl/graph/sega/push_engine.hh | 5 ++- 6 files changed, 51 insertions(+), 39 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 54f22b1377..7baa27fd5e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -53,7 +53,6 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=64, max_resp_per_cycle=8, active_buffer_size = 64, post_push_wb_queue_size=64, @@ -61,7 +60,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=512, + resp_queue_size=4096, update_queue_size=32, ) @@ -74,7 +73,11 @@ def __init__(self, edge_memory_size: str, cache_size: str): range=AddrRange(edge_memory_size), in_addr_map=False ) ) - + # self.edge_mem_ctrl = SimpleMemory(latency="90ns", + # latency_var="0ns", + # bandwidth="18GiB/s", + # range=AddrRange(edge_memory_size), + # in_addr_map=False) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -105,6 +108,9 @@ def set_vertex_pch_bit(self, pch_bit): def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image + # def set_edge_image(self, edge_image): + # self.edge_mem_ctrl.image_file = edge_image + class SEGA(System): diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index a447dedc3d..76e7d262e8 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,8 +37,6 @@ class CoalesceEngine(BaseMemoryEngine): cache_size = Param.MemorySize("Size of the internal SRAM array.") - 
num_mshr_entry = Param.Int("Number of MSHR entries.") - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") active_buffer_size = Param.Int("Maximum number of memory active memory " diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0aa61345f7..d7cf173097 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,7 +46,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), + onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), pullsReceived(0), pullsScheduled(0), pendingPullReads(0), activeBufferSize(params.active_buffer_size), @@ -227,7 +227,6 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].dirty); - assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " @@ -239,7 +238,6 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // miss assert(cacheBlocks[block_index].addr != aligned_addr); - assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (cacheBlocks[block_index].state != CacheState::INVALID) { @@ -284,29 +282,26 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // cold miss assert(MSHR.find(block_index) == MSHR.end()); - if (MSHR.size() < numMSHREntries) { - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].dirty = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].state = 
CacheState::PENDING_DATA; - cacheBlocks[block_index].lastChangedTick = curTick(); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); - MSHR[block_index].push_back(addr); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - return ReadReturnStatus::ACCEPT; - } else { - return ReadReturnStatus::REJECT_ROLL; + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } + return ReadReturnStatus::ACCEPT; } + stats.readMisses++; } } @@ -939,6 +934,8 @@ CoalesceEngine::processNextApplyEvent() owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } pkt->deleteData(); @@ -986,6 +983,8 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].items[index].edgeIndex, cacheBlocks[block_index].items[index].degree); pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } @@ -1057,8 +1056,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), - ADD_STAT(mshrEntryShortage, 
statistics::units::Count::get(), - "Number of cache rejections caused by entry shortage."), ADD_STAT(responsePortShortage, statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. "), @@ -1082,7 +1079,7 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(vertexPushBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(bitvectorLength, statistics::units::Count::get(), + ADD_STAT(frontierSize, statistics::units::Count::get(), "Histogram of the length of the bitvector."), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. (ns)"), @@ -1103,7 +1100,7 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - bitvectorLength.init(64); + frontierSize.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c457b214f9..f87e0027a2 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,7 +106,6 @@ class CoalesceEngine : public BaseMemoryEngine Block* cacheBlocks; int onTheFlyReqs; - int numMSHREntries; std::unordered_map> MSHR; // Response route to WLEngine @@ -167,7 +166,6 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar verticesPulled; @@ -180,7 +178,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram bitvectorLength; + statistics::Histogram frontierSize; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; diff 
--git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a17991e335..09f29a43e4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -158,7 +158,7 @@ PushEngine::start() // assert(!nextVertexPullEvent.scheduled()); _running = true; - stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); + // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. assert(workLeft()); if (vertexSpace() && !nextVertexPullEvent.scheduled()) { @@ -196,6 +196,7 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, sizeof(Edge), peerMemoryAtomSize); edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); numPendingPulls--; if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { @@ -239,6 +240,7 @@ PushEngine::processNextMemoryReadEvent() stats.edgePointerQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); } @@ -282,6 +284,7 @@ PushEngine::handleMemResp(PacketPtr pkt) MetaEdge meta_edge( push_info.src, edge_dst, edge_weight, push_info.value); metaEdgeQueue.emplace_back(meta_edge, curTick()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); } stats.numWastefulEdgesRead += (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; @@ -320,6 +323,7 @@ PushEngine::processNextPropagateEvent() stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); } else { metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } @@ -466,8 +470,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of propagate operations done."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), - ADD_STAT(numIdleCycles, statistics::units::Count::get(), - "Number of cycles PushEngine has been idle."), + // ADD_STAT(numIdleCycles, statistics::units::Count::get(), + // "Number of cycles PushEngine has been idle."), ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), "Number of coalescions in the update queues."), ADD_STAT(numUpdates, statistics::units::Count::get(), @@ -479,8 +483,12 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Traversed Edges Per Second."), ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(), + "Histogram of the size of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the metaEdgeQueue."), + ADD_STAT(edgeQueueLength, statistics::units::Count::get(), + "Histogram of the size of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), 
ADD_STAT(numPropagatesHist, statistics::units::Count::get(), @@ -496,7 +504,9 @@ PushEngine::PushStats::regStats() TEPS = numPropagates / simSeconds; edgePointerQueueLatency.init(64); + edgePointerQueueLength.init(64); edgeQueueLatency.init(64); + edgeQueueLength.init(64); updateQueueLength.init(64); numPropagatesHist.init(push.params().max_propagates_per_cycle); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 08cceb14f0..f51865acb3 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -164,9 +164,10 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; + statistics::Scalar numMemoryBlocks; statistics::Scalar numPropagates; statistics::Scalar numNetBlocks; - statistics::Scalar numIdleCycles; + // statistics::Scalar numIdleCycles; statistics::Scalar updateQueueCoalescions; statistics::Scalar numUpdates; statistics::Scalar numWastefulEdgesRead; @@ -174,7 +175,9 @@ class PushEngine : public BaseMemoryEngine statistics::Formula TEPS; statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; statistics::Histogram updateQueueLength; statistics::Histogram numPropagatesHist; }; From b68602b864a995b5d5a248fb5364f973fc2ace3b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 07:36:05 -0800 Subject: [PATCH 213/247] Adding state. 
--- configs/accl/bfs.py | 35 ++++++++++++++++++++------ configs/accl/sega.py | 6 +---- src/accl/graph/sega/PushEngine.py | 4 +-- src/accl/graph/sega/coalesce_engine.cc | 26 +++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/work_directory.hh | 10 +++++--- 6 files changed, 57 insertions(+), 25 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index a201acd4d1..80331e3aad 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -47,6 +47,14 @@ def get_inputs(): default=False, help="Print final answer", ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample statistics", + ) args = argparser.parse_args() @@ -56,24 +64,37 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.sample, args.verify, ) if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() + num_gpts, cache_size, graph, init_addr, init_value, sample, verify = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() - system.create_pop_count_directory(256) + system.create_pop_count_directory(64) system.create_bfs_workload(init_addr, init_value) - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + if sample: + while True: + exit_event = m5.simulate(10000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) if verify: system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7baa27fd5e..29a017ba65 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,6 
+61,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=4096, + max_propagates_per_cycle=8, update_queue_size=32, ) @@ -73,11 +74,6 @@ def __init__(self, edge_memory_size: str, cache_size: str): range=AddrRange(edge_memory_size), in_addr_map=False ) ) - # self.edge_mem_ctrl = SimpleMemory(latency="90ns", - # latency_var="0ns", - # bandwidth="18GiB/s", - # range=AddrRange(edge_memory_size), - # in_addr_map=False) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 20c5452d43..63fa1eae62 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -42,8 +42,8 @@ class PushEngine(BaseMemoryEngine): "push engine where it stores the " "edges read from memory.") - max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " - "done per cycle.") + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") update_queue_size = Param.Int("Maximum number of entries " "for each update queue.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d7cf173097..adb33064f7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -239,7 +239,7 @@ CoalesceEngine::recvWLRead(Addr addr) // miss assert(cacheBlocks[block_index].addr != aligned_addr); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - + stats.readMisses++; if (cacheBlocks[block_index].state != CacheState::INVALID) { // conflict miss DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " @@ -268,7 +268,9 @@ CoalesceEngine::recvWLRead(Addr addr) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = 
directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } // NOTE: Bring the cache line to invalid state. // NOTE: Above line where we set hasConflict to true @@ -301,7 +303,6 @@ CoalesceEngine::recvWLRead(Addr addr) } return ReadReturnStatus::ACCEPT; } - stats.readMisses++; } } @@ -376,8 +377,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[index]); } if (atom_active) { - directory->deactivate(addr); + int count = directory->deactivate(addr); activeCacheBlocks.push_back(block_index); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } assert(MSHR.find(block_index) != MSHR.end()); @@ -433,8 +436,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active |= graphWorkload->activeCondition(items[index]); } if (atom_active) { - directory->deactivate(addr); + int count = directory->deactivate(addr); activeBuffer.emplace_back(pkt, curTick()); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. 
" "activeBuffer.size: %d.\n", __func__, pkt->print(), activeBuffer.size()); @@ -591,7 +596,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } cacheBlocks[block_index].reset(); } @@ -804,7 +811,9 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } PacketPtr pkt = createWritePacket( @@ -1081,6 +1090,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Rate at which vertices are pushed."), ADD_STAT(frontierSize, statistics::units::Count::get(), "Histogram of the length of the bitvector."), + ADD_STAT(blockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), @@ -1101,6 +1112,7 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; frontierSize.init(64); + blockActiveCount.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f87e0027a2..b855fda38b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -179,6 +179,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPushBW; statistics::Histogram frontierSize; + statistics::Histogram blockActiveCount; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 4102e29cd3..35778686c8 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -38,8 +38,8 @@ namespace gem5 class WorkDirectory { public: - virtual void activate(Addr atom_addr) = 0; - virtual void deactivate(Addr atom_addr) = 0; + virtual int activate(Addr atom_addr) = 0; + virtual int deactivate(Addr atom_addr) = 0; virtual Addr getNextWork() = 0; virtual int workCount() = 0; @@ -99,7 +99,7 @@ class PopCountDirectory: public WorkDirectory // CAUTION: This should only be called when the work // directory **is not** tracking the the atom with atom_addr - virtual void activate(Addr atom_addr) + virtual int activate(Addr atom_addr) { int index = getIndexFromAtomAddr(atom_addr); uint32_t prev_count = popCount[index]; @@ -107,11 +107,12 @@ class PopCountDirectory: public WorkDirectory _workCount++; assert(popCount[index] > prev_count); assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; } // CAUTION: This should only be called when the work // directory **is** tracking the the atom with atom_addr - virtual 
void deactivate(Addr atom_addr) + virtual int deactivate(Addr atom_addr) { int index = getIndexFromAtomAddr(atom_addr); uint32_t prev_count = popCount[index]; @@ -119,6 +120,7 @@ class PopCountDirectory: public WorkDirectory _workCount--; assert(popCount[index] < prev_count); assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; } virtual int workCount() { return _workCount; } From ec5025f2b3b1143ed9c1663e47464d937705ded3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 15:00:00 -0800 Subject: [PATCH 214/247] Adding stat to count number of conflict misses. --- src/accl/graph/sega/coalesce_engine.cc | 3 +++ src/accl/graph/sega/coalesce_engine.hh | 1 + 2 files changed, 4 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index adb33064f7..8c636615cd 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -279,6 +279,7 @@ CoalesceEngine::recvWLRead(Addr addr) } return ReadReturnStatus::REJECT_NO_ROLL; } else { + stats.numConflicts++; return ReadReturnStatus::REJECT_ROLL; } } else { @@ -1065,6 +1066,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), ADD_STAT(responsePortShortage, statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. 
"), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b855fda38b..c2da6a90cd 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -166,6 +166,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; + statistics::Scalar numConflicts; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar verticesPulled; From ca971137593af82054c428ea6d8bca8e949463d0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 15:17:20 -0800 Subject: [PATCH 215/247] Adding stat to count the number of update rolls. --- src/accl/graph/sega/coalesce_engine.cc | 3 --- src/accl/graph/sega/enums.cc | 3 +-- src/accl/graph/sega/enums.hh | 1 - src/accl/graph/sega/wl_engine.cc | 4 ++++ src/accl/graph/sega/wl_engine.hh | 1 + 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8c636615cd..b9ac25c502 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -179,9 +179,6 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit - if (cacheBlocks[block_index].state == CacheState::LOCKED_FOR_APPLY) { - return ReadReturnStatus::REJECT_NO_ROLL; - } DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; assert(cacheBlocks[block_index].state != CacheState::INVALID); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 8c9d223178..de5d569c18 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -36,8 +36,7 @@ const char* cacheStateStrings[NUM_CACHE_STATE] = { "PENDING_DATA", "BUSY", "IDLE", - "PENDING_WB", - "LOCKED_FOR_APPLY" + "PENDING_WB" }; diff --git 
a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index e7a8f84452..6153386b71 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -39,7 +39,6 @@ enum CacheState BUSY, IDLE, PENDING_WB, - LOCKED_FOR_APPLY, NUM_CACHE_STATE }; extern const char* cacheStateStrings[NUM_CACHE_STATE]; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 2b305e1557..ed91622b43 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -224,6 +224,7 @@ WLEngine::processNextReadEvent() update_addr, update_value, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); + stats.numUpdateRolls++; } else { DPRINTF(WLEngine, "%s: Received a reject from cache. " "Not rolling the update.\n", __func__); @@ -330,6 +331,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " "stalled because of register shortage"), + ADD_STAT(numUpdateRolls, statistics::units::Count::get(), + "Number of times an update has been rolled back " + "to the back of the update queue due to cache reject."), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), "Histogram of the latency of reading a vertex (ns)."), ADD_STAT(updateQueueLatency, statistics::units::Second::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index b5ad3d9040..45baaa1e79 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -101,6 +101,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + statistics::Scalar numUpdateRolls; statistics::Histogram vertexReadLatency; statistics::Histogram updateQueueLatency; From fd1561f7435537165a458e3aac7afded87904475 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 19:47:35 -0800 
Subject: [PATCH 216/247] Removing unnecessary comments. --- src/accl/graph/sega/coalesce_engine.cc | 52 +++----------------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b9ac25c502..98229dde24 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -86,6 +86,9 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) int block_index = getBlockIndex(addr); // FIXME: Check postPushWBQueue for hits + // Is it really the case though. I don't think at this time + // beacuse we check done after handleMemResp and make sure all + // the writes to memory are done before scheduling an exit event if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -438,23 +441,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) activeBuffer.emplace_back(pkt, curTick()); stats.blockActiveCount.sample(count); stats.frontierSize.sample(directory->workCount()); - DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. " - "activeBuffer.size: %d.\n", __func__, - pkt->print(), activeBuffer.size()); } else { delete pkt; } - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // pullsScheduled++; - // } + if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { @@ -685,9 +675,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) need_send_pkt = false; wb = postPushWBQueue.erase(wb); delete wb_pkt; - DPRINTF(MSDebug, "%s: Found addr: %lu in postPushWBQueue. 
" - "postPushWBQueue.size: %d.\n", __func__, - cacheBlocks[block_index].addr, postPushWBQueue.size()); } else { wb++; } @@ -707,16 +694,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) need_send_pkt = false; ab = activeBuffer.erase(ab); delete ab_pkt; - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // pullsScheduled++; - // } - DPRINTF(MSDebug, "%s: Found addr: %lu in activeBuffer. " - "activeBuffer.size: %d.\n", __func__, - cacheBlocks[block_index].addr, activeBuffer.size()); if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { @@ -841,6 +818,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) if (postPushWBQueue.empty()) { return; } + PacketPtr wb_pkt; Tick pkt_tick; std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); @@ -848,9 +826,6 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) memPort.sendPacket(wb_pkt); onTheFlyReqs++; postPushWBQueue.pop_front(); - DPRINTF(MSDebug, "%s: Popped pkt: %s from postPushWBQueue. " - "postPushWBQueue.size: %d.\n", __func__, - wb_pkt->print(), postPushWBQueue.size()); } } @@ -958,13 +933,7 @@ CoalesceEngine::processNextApplyEvent() PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); - DPRINTF(MSDebug, "%s: Empalced pkt: %s in postPushWBQueue. " - "postPushWBQueue.size: %d.\n", __func__, - wb_pkt->print(), postPushWBQueue.size()); activeBuffer.pop_front(); - DPRINTF(MSDebug, "%s: Popped pkt: %s from activeBuffer. 
" - "activeBuffer.size: %d.\n", __func__, - pkt->print(), activeBuffer.size()); memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextPostPushWB(ignore, schedule_tick); @@ -1020,17 +989,6 @@ CoalesceEngine::processNextApplyEvent() "work to apply.\n", __func__); } - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // pullsScheduled++; - // } if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { From 1124f5be5c9272df474387555d95f4e0603486c1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 22:17:20 -0800 Subject: [PATCH 217/247] Removing comments. --- src/accl/graph/sega/work_directory.hh | 103 ++++++++------------------ 1 file changed, 30 insertions(+), 73 deletions(-) diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 35778686c8..18430aee0d 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -29,6 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ #define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#include + +#include "accl/graph/base/data_structs.hh" #include "base/addr_range.hh" #include "base/types.hh" @@ -63,9 +66,11 @@ class PopCountDirectory: public WorkDirectory int lastCounterIndex; uint32_t* popCount; - uint32_t currentIndex; + uint32_t prevIndex; uint32_t currentCounter; + UniqueFIFO activeBlockIndices; + int getIndexFromAtomAddr(Addr atom_addr) { assert((atom_addr % memoryAtomSize) == 0); @@ -86,7 +91,7 @@ class PopCountDirectory: public WorkDirectory WorkDirectory(), memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), memoryAtomSize(atom_size), _workCount(0), - 
currentIndex(0), currentCounter(0) + prevIndex(-1), currentCounter(0) { blockSize = numAtomsPerBlock * memoryAtomSize; int numCounters = (int) (memoryRange.size() / blockSize); @@ -105,6 +110,7 @@ class PopCountDirectory: public WorkDirectory uint32_t prev_count = popCount[index]; popCount[index]++; _workCount++; + activeBlockIndices.push_back(index); assert(popCount[index] > prev_count); assert(popCount[index] <= numAtomsPerBlock); return popCount[index]; @@ -118,6 +124,9 @@ class PopCountDirectory: public WorkDirectory uint32_t prev_count = popCount[index]; popCount[index]--; _workCount--; + if (popCount[index] == 0) { + activeBlockIndices.erase(index); + } assert(popCount[index] < prev_count); assert(popCount[index] <= numAtomsPerBlock); return popCount[index]; @@ -130,80 +139,28 @@ class PopCountDirectory: public WorkDirectory lastCounterIndex = getIndexFromAtomAddr(atom_addr); } - // CAUTION: If this function returns an addr that - // is in the cache, that addr should be ignored. - // CAUTION: The receiver should track the last n - // addresses that this WorkDirectory has generated. - // where n is equal to the size of the entry holding - // reads generated by this WorkDirectory. In case - // the WorkDirectory generates a repeated address - // it should be ignored. - // FIXME: This should return garbage if it can't find anything. - // virtual Addr getNextWork() - // { - // if ((currentCounter == numAtomsPerBlock) || - // (popCount[currentIndex] == 0)) { - // int prev_index = currentIndex; - // while (true) { - // currentIndex++; - // // NOTE: this is an optimization. - // // lastCounterIndex tracks the last blockOfAtom that - // // has vertices. By default it is set to numCounters - 1. - // // However, it might not be necessary to track all the - // // numCounters counters. e.g. If this WorkDirectory is tracking - // // a 512 MiB memory with atom size of 32 B and 256 atoms - // // per block. Then it needs 64 Ki counters of 8 bit wide. 
- // // However, if we need 8 Mi atoms to store all our vertices, - // // the second half of the counters would not be used at all - // // (512 MiB hold 16 Mi atoms and we're only using half). - // if (currentIndex > lastCounterIndex) { - // currentIndex = 0; - // } - // if (prev_index == currentIndex) { - // // NOTE: If we have reached the same index as before, - // // we need to decrement the currentCounter to generate - // // a repeatative address. This way the receiver can detect - // // the uselessness of the generated address and ignore it - // currentCounter--; - // break; - // } - // if (popCount[currentIndex] > 0) { - // currentCounter = 0; - // break; - // } - // } - // } - // Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); - // currentCounter++; - - // return ret_addr; - // } - + // CAUTION: This directory only tracks active vertices in the memory + // and it does not have any information on the state of the cache and/or + // the active buffer or the write buffer. Therefore, it might generate a + // read request to an address that might be in any of those. In that case, + // the generated address should be ignored. virtual Addr getNextWork() { - if ((currentCounter == numAtomsPerBlock) || - (popCount[currentIndex] == 0)) { - int other_count = _workCount - popCount[currentIndex]; - if (other_count == 0) { - currentCounter = 0; - } else { - int prev_index = currentIndex; - while (true) { - currentIndex++; - if (currentIndex > lastCounterIndex) { - currentIndex = 0; - } - if (currentIndex == prev_index) { - break; - } - if (popCount[currentIndex] > 0) { - break; - } - } - currentCounter = 0; - } + // Why ask directory if it's empty? 
+ assert(!activeBlockIndices.empty()); + int front_index = activeBlockIndices.front(); + assert(popCount[front_index] > 0); + if ((prevIndex != -1) && (prevIndex != front_index)) { + currentCounter = 0; + } + if (currentCounter == numAtomsPerBlock) { + currentCounter = 0; + activeBlockIndices.pop_front(); + activeBlockIndices.push_back(front_index); } - Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + int current_index = activeBlockIndices.front(); + Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter); + prevIndex = current_index; currentCounter++; return ret_addr; } From c2b08a68d27767737a489c72e7fcf7d80be10bc2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 9 Nov 2022 09:05:29 -0800 Subject: [PATCH 218/247] Adding pr and updating config scripts. --- configs/accl/bfs.py | 24 ++-- configs/accl/pr-sample.py | 109 -------------- configs/accl/pr.py | 44 +++++- configs/accl/sega.py | 36 +++-- src/accl/graph/base/graph_workload.cc | 157 +++++++++------------ src/accl/graph/base/graph_workload.hh | 38 ++--- src/accl/graph/sega/CenteralController.py | 2 +- src/accl/graph/sega/CoalesceEngine.py | 1 + src/accl/graph/sega/centeral_controller.cc | 10 +- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 53 ++++--- src/accl/graph/sega/coalesce_engine.hh | 5 +- 12 files changed, 201 insertions(+), 280 deletions(-) delete mode 100644 configs/accl/pr-sample.py diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 80331e3aad..829449c599 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -40,20 +40,20 @@ def get_inputs(): argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) argparser.add_argument( - "--verify", - dest="verify", + "--sample", + dest="sample", action="store_const", const=True, default=False, - help="Print final answer", + help="Sample sim stats every 100us", ) argparser.add_argument( - "--sample", - dest="sample", + 
"--verify", + dest="verify", action="store_const", const=True, default=False, - help="Sample statistics", + help="Print final answer", ) args = argparser.parse_args() @@ -70,7 +70,15 @@ def get_inputs(): if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, sample, verify = get_inputs() + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + sample, + verify, + ) = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) @@ -81,7 +89,7 @@ def get_inputs(): system.create_bfs_workload(init_addr, init_value) if sample: while True: - exit_event = m5.simulate(10000000) + exit_event = m5.simulate(100000000) print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" diff --git a/configs/accl/pr-sample.py b/configs/accl/pr-sample.py deleted file mode 100644 index ac3616dc84..0000000000 --- a/configs/accl/pr-sample.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from sega import SEGA - -import m5 -import argparse - -from m5.objects import * - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("alpha", type=float) - argparser.add_argument("threshold", type=float) - argparser.add_argument( - "--verify", - dest="verify", - action="store_const", - const=True, - default=False, - help="Print final answer", - ) - argparser.add_argument( - "--sample", - dest="sample", - action="store_const", - const=True, - default=False, - help="Sample sim stats every 10us", - ) - - args = argparser.parse_args() - - return ( - args.num_gpts, - args.cache_size, - args.graph, - args.alpha, - args.threshold, - args.verify, - args.sample, - ) - - -if __name__ == "__m5_main__": - ( - num_gpts, - cache_size, - graph, - alpha, - threshold, - verify, - sample, - ) = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system=False, system=system) - - m5.instantiate() - - system.create_pr_workload(alpha, threshold) - - if sample: - while True: - exit_event = m5.simulate(10000000) 
- print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - m5.stats.dump() - m5.stats.reset() - print(exit_event.getCause()) - if exit_event.getCause() != "simulate() limit reached": - break - else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - if verify: - system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 59e8b924c6..e852e47561 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -39,6 +39,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) argparser.add_argument( "--verify", dest="verify", @@ -56,23 +64,45 @@ def get_inputs(): args.graph, args.alpha, args.threshold, + args.sample, args.verify, ) - if __name__ == "__m5_main__": - num_gpts, cache_size, graph, alpha, threshold, verify = get_inputs() + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + sample, + verify, + ) = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() + system.create_pop_count_directory(64) system.create_pr_workload(alpha, threshold) - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + print(exit_event.getCause()) + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) if verify: 
system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 29a017ba65..7831302228 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,14 +47,18 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): + def __init__( + self, edge_memory_size: str, cache_size: str, simple_mem: bool = False + ): super().__init__() + self._simple_mem = simple_mem self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - active_buffer_size = 64, + pending_pull_limit=32, + active_buffer_size=64, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -65,9 +69,15 @@ def __init__(self, edge_memory_size: str, cache_size: str): update_queue_size=32, ) - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(), dram_2=HBM_2000_4H_1x64() - ) + if self._simple_mem: + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + ) + else: + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), + dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) + ) self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( @@ -96,18 +106,20 @@ def setReqPort(self, port): self.push_engine.out_ports = port def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + if self._simple_mem: + self.vertex_mem_ctrl.range = vertex_ranges[0] + else: + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit + if self._simple_mem: + pass + else: + self.vertex_mem_ctrl.pch_bit = pch_bit def 
set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image - # def set_edge_image(self, edge_image): - # self.edge_mem_ctrl.image_file = edge_image - - class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 446509201f..0539296cce 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -113,92 +113,75 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } -// PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): -// GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) -// { -// numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -// } - -// void -// PRWorkload::init(PacketPtr pkt, int bit_index_base, -// std::bitset& needsPush, -// std::deque& activeBits, -// int& _workCount) -// { -// WorkListItem items[numElementsPerLine]; - -// pkt->writeDataToBlock((uint8_t*) items, atomSize); -// for (int i = 0; i < numElementsPerLine; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - alpha); -// if (items[i].degree > 0) { -// needsPush[bit_index_base + i] = 1; -// activeBits.push_back(bit_index_base + i); -// _workCount++; -// } -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, atomSize); -// } - -// uint32_t -// PRWorkload::reduce(uint32_t update, uint32_t value) -// { -// float update_float = writeToFloat(update); -// float value_float = writeToFloat(value); -// return readFromFloat(update_float + value_float); -// } - -// uint32_t -// PRWorkload::propagate(uint32_t value, uint32_t weight) -// { -// float value_float = writeToFloat(value); -// float weight_float = 1.0; - -// return readFromFloat(alpha * value_float * weight_float); -// } - -// bool -// PRWorkload::applyCondition(WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float 
prop_float = writeToFloat(wl.prop); -// float dist = std::abs(temp_float - prop_float); -// return dist >= threshold; -// } - -// bool -// PRWorkload::preWBApply(WorkListItem& wl) -// { -// if (applyCondition(wl) && (wl.degree > 0)) { -// return true; -// } -// return false; -// } - -// std::tuple -// PRWorkload::apply(WorkListItem& wl) -// { -// if (applyCondition(wl)) { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float delta = (temp_float - prop_float) / wl.degree; -// uint32_t delta_uint = readFromFloat(delta); -// wl.prop = wl.tempProp; -// return std::make_tuple(delta_uint, true, true); -// } -// return std::make_tuple(0, false, false); -// } - -// std::string -// PRWorkload::printWorkListItem(const WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// return csprintf( -// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", -// temp_float, temp_float, wl.degree, wl.edgeIndex -// ); -// } +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + items[i].tempProp = readFromFloat(0); + items[i].prop = readFromFloat(1 - alpha); + atom_active |= activeCondition(items[i]); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { 
+ weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float dist = std::abs(temp_float - prop_float); + return dist >= threshold; +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + uint32_t delta_uint = readFromFloat(delta); + wl.prop = wl.tempProp; + return delta_uint; +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", + temp_float, prop_float, wl.degree, wl.edgeIndex); +} } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index f71955bd16..f335ad9b47 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -77,24 +77,26 @@ class BFSWorkload : public GraphWorkload }; -// class PRWorkload : public GraphWorkload -// { -// private: -// float alpha; -// float threshold; - -// public: -// PRWorkload(float alpha, float threshold); - -// ~PRWorkload() {} - -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr 
pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 0c21833a05..09a997696d 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createBFSWorkload"), - # PyBindMethod("createPRWorkload"), + PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 76e7d262e8..c2393c2f1e 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -39,6 +39,7 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") active_buffer_size = Param.Int("Maximum number of memory active memory " "atoms ready to send updates. 
This parameter " "and post_push_wb_queue_size should be set " diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 883992e64e..60c78559e4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -110,11 +110,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -// void -// CenteralController::createPRWorkload(float alpha, float threshold) -// { -// workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); -// } +void +CenteralController::createPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} void CenteralController::recvDoneSignal() diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 6eb07dbcac..ae2980d050 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -63,7 +63,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createBFSWorkload(Addr init_addr, uint32_t init_value); - // void createPRWorkload(float alpha, float threshold); + void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 98229dde24..8ac40198be 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,10 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), - maxRespPerCycle(params.max_resp_per_cycle), - pullsReceived(0), pullsScheduled(0), pendingPullReads(0), - activeBufferSize(params.active_buffer_size), + onTheFlyReqs(0), 
maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), + pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -129,29 +129,17 @@ CoalesceEngine::done() } bool -CoalesceEngine::timeToPull() +CoalesceEngine::enoughSpace() { - return (activeBuffer.size() + pendingPullReads) < activeBufferSize; -} - -bool -CoalesceEngine::canSchedulePull() -{ - // TODO: Maybe a good idea to change this to - // activeBuffer.size() + pendingPullReads + pullsScheduled < activeBufferSize - return pullsScheduled < 1; -} - -bool -CoalesceEngine::workLeftInMem() -{ - return !directory->empty(); + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; } bool CoalesceEngine::pullCondition() { - return ((activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize); + bool enough_space = enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; } // addr should be aligned to peerMemoryAtomSize @@ -784,12 +772,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) atom_active |= graphWorkload->activeCondition( cacheBlocks[block_index].items[index]); } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); - } PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, @@ -797,8 +779,21 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; + if (atom_active) { + activeCacheBlocks.erase(block_index); + if 
(enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + memPort.sendPacket(pkt); + onTheFlyReqs++; + } cacheBlocks[block_index].reset(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c2da6a90cd..f605704b6d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,6 +118,7 @@ class CoalesceEngine : public BaseMemoryEngine UniqueFIFO activeCacheBlocks; int pullsScheduled; + int pendingPullLimit; int pendingPullReads; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. @@ -128,9 +129,7 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> activeBuffer; std::deque> postPushWBQueue; - bool timeToPull(); - bool canSchedulePull(); - bool workLeftInMem(); + bool enoughSpace(); bool pullCondition(); int getBlockIndex(Addr addr); From ccaa539854ee30fc4ea9e6289968ddcf9700edf1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 9 Nov 2022 21:24:39 -0800 Subject: [PATCH 219/247] Updating activeCondition for PR. 
--- src/accl/graph/base/graph_workload.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 0539296cce..05c8d05089 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -160,7 +160,7 @@ PRWorkload::activeCondition(WorkListItem wl) float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); float dist = std::abs(temp_float - prop_float); - return dist >= threshold; + return (dist >= threshold) && (wl.degree > 0); } uint32_t From 3747d9f40e7dd23a7e958621090b02ba58cd79c9 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Nov 2022 15:36:40 -0800 Subject: [PATCH 220/247] Adding SSSP and CC --- src/accl/graph/base/graph_workload.cc | 172 ++++++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 58 +++++++++ 2 files changed, 230 insertions(+) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 05c8d05089..e36c074da9 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -113,6 +113,121 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } +void +BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + 
+uint32_t +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) +{ + return 1; +} + +bool +BFSVisitedWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +BFSVisitedWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + +void +SSSPWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +SSSPWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +SSSPWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + weight; +} + +bool +SSSPWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +SSSPWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +SSSPWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + + void PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -184,4 +299,61 @@ PRWorkload::printWorkListItem(const WorkListItem wl) temp_float, prop_float, wl.degree, 
wl.edgeIndex); } +void +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + Addr pkt_addr = pkt->getAddr(); + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + items[i].tempProp = (int) ( pkt_addr / sizeof(WorkListItem)) + i; + items[i].prop = -1; + atom_active |= activeCondition(items[i]); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +CCWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +CCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +bool +CCWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +CCWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +CCWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index f335ad9b47..de2877d6e8 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -76,6 +76,48 @@ class BFSWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; +class BFSVisitedWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + BFSVisitedWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~BFSVisitedWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t 
reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class SSSPWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + SSSPWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~SSSPWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + class PRWorkload : public GraphWorkload { @@ -98,6 +140,22 @@ class PRWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; +class CCWorkload : public GraphWorkload +{ + + public: + CCWorkload() {} + + ~CCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + } #endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ From 000103e41a94e4baf407eca22e44c3aabb0fe972 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 11 Nov 2022 14:40:50 -0800 Subject: [PATCH 221/247] Adding option to use SimpleMemory for vertex memory. 
--- configs/accl/bfs.py | 17 ++- configs/accl/pr.py | 20 ++- configs/accl/real-graph-gen.py | 16 ++- configs/accl/sega.py | 34 ++--- .../accl/{sega-simple.py => sega_simple.py} | 133 ++++++++---------- 5 files changed, 113 insertions(+), 107 deletions(-) rename configs/accl/{sega-simple.py => sega_simple.py} (50%) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 829449c599..806aa8a915 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from sega import SEGA import m5 import argparse @@ -39,6 +38,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -64,6 +71,7 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.simple, args.sample, args.verify, ) @@ -76,10 +84,15 @@ def get_inputs(): graph, init_addr, init_value, + simple, sample, verify, ) = get_inputs() - + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index e852e47561..e3d7c764ad 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from sega import SEGA import m5 import argparse @@ -39,6 +38,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -64,10 +71,12 @@ def get_inputs(): args.graph, args.alpha, args.threshold, + args.simple, args.sample, args.verify, ) + if __name__ == "__m5_main__": ( num_gpts, @@ -75,10 +84,15 @@ def get_inputs(): graph, alpha, threshold, + simple, sample, verify, ) = get_inputs() - + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) @@ -95,7 +109,6 @@ def get_inputs(): ) m5.stats.dump() m5.stats.reset() - print(exit_event.getCause()) if exit_event.getCause() != "simulate() limit reached": break else: @@ -106,3 +119,4 @@ def get_inputs(): ) if verify: system.print_answer() + diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py index b943a925c1..332bb67452 100644 --- a/configs/accl/real-graph-gen.py +++ b/configs/accl/real-graph-gen.py @@ -45,8 +45,11 @@ def get_inputs(): if __name__ == "__main__": graph_path, num_gpts = get_inputs() + graph_sorter = os.environ.get("GRAPH_SORTER") graph_reader = os.environ.get("GRAPH_READER") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER.") if graph_reader is None: raise ValueError(f"No value for $GRAPH_READER.") @@ -54,6 +57,17 @@ def get_inputs(): raise ValueError(f"{graph_path} does not exist.") graph_dir = os.path.dirname(graph_path) + sorted_graph = f"{graph_dir}/sorted_graph.txt" + if not os.path.exists(sorted_graph): + print(f"Sorting {graph_path} into {sorted_graph}.") + subprocess.run( + [ + "python", + f"{graph_sorter}", + f"{graph_path}", + 
f"{sorted_graph}", + ] + ) if not "binaries" in os.listdir(graph_dir): print(f"binaries directory not found in {graph_dir}") os.mkdir(f"{graph_dir}/binaries") @@ -80,7 +94,7 @@ def get_inputs(): subprocess.run( [ f"{graph_reader}", - f"{graph_path}", + f"{sorted_graph}", "false", f"{num_gpts}", "32", diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7831302228..1ea36ea49e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,11 +48,9 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__( - self, edge_memory_size: str, cache_size: str, simple_mem: bool = False - ): + self, edge_memory_size: str, cache_size: str): super().__init__() - self._simple_mem = simple_mem - self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, @@ -69,20 +67,14 @@ def __init__( update_queue_size=32, ) - if self._simple_mem: - self.vertex_mem_ctrl = SimpleMemory( - latency="122ns", latency_var="0ns", bandwidth="28GiB/s" - ) - else: - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), - dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) - ) + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), + dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) + ) self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False - ) + range=AddrRange(edge_memory_size), in_addr_map=False) ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -106,17 +98,11 @@ def setReqPort(self, port): self.push_engine.out_ports = port def 
set_vertex_range(self, vertex_ranges): - if self._simple_mem: - self.vertex_mem_ctrl.range = vertex_ranges[0] - else: - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] def set_vertex_pch_bit(self, pch_bit): - if self._simple_mem: - pass - else: - self.vertex_mem_ctrl.pch_bit = pch_bit + self.vertex_mem_ctrl.pch_bit = pch_bit def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image diff --git a/configs/accl/sega-simple.py b/configs/accl/sega_simple.py similarity index 50% rename from configs/accl/sega-simple.py rename to configs/accl/sega_simple.py index 7ec19c92ae..f59fa71a79 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega_simple.py @@ -24,90 +24,87 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import m5 -import argparse - from math import log from m5.objects import * + def interleave_addresses(plain_range, num_channels, cache_line_size): intlv_low_bit = log(cache_line_size, 2) intlv_bits = log(num_channels, 2) ret = [] for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) return ret + class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): + def __init__( + self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=32, + active_buffer_size=64, + post_push_wb_queue_size=64, + ) self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=32, - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="0ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") + + self.edge_mem_ctrl = MemCtrl( + 
dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), in_addr_map=False) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) def getRespPort(self): return self.wl_engine.in_ports + def setRespPort(self, port): self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports + def setReqPort(self, port): self.push_engine.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image + self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" @@ -115,14 +112,12 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), num_mpus, 32 + ) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) + gpt = GPT("4GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) @@ -134,32 +129,16 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - 
argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + def create_pop_count_directory(self, atoms_per_block): + for gpt in self.gpts: + gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) - m5.instantiate() + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) - system.create_initial_bfs_update(init_addr, init_value) + def print_answer(self): + self.ctrl.printAnswerToHostSimout() - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") From 4b30d61b3a7b5261973467c478d2243da896d83b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 12:55:54 -0800 Subject: [PATCH 222/247] Removing graph gen scripts and moved to sega-utils. --- configs/accl/real-graph-gen.py | 107 ------------------------ configs/accl/synth-graph-gen.py | 139 -------------------------------- 2 files changed, 246 deletions(-) delete mode 100644 configs/accl/real-graph-gen.py delete mode 100644 configs/accl/synth-graph-gen.py diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py deleted file mode 100644 index 332bb67452..0000000000 --- a/configs/accl/real-graph-gen.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import os -import argparse -import subprocess - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("path", type=str, help="Path to the graph file.") - argparser.add_argument( - "num_gpts", - type=int, - help="Number gpts to create synth graph binaries for.", - ) - - args = argparser.parse_args() - return args.path, args.num_gpts - - -if __name__ == "__main__": - graph_path, num_gpts = get_inputs() - - graph_sorter = os.environ.get("GRAPH_SORTER") - graph_reader = os.environ.get("GRAPH_READER") - - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER.") - if graph_reader is None: - raise ValueError(f"No value for $GRAPH_READER.") - - if not os.path.exists(graph_path): - raise ValueError(f"{graph_path} does not exist.") - - graph_dir = os.path.dirname(graph_path) - sorted_graph = f"{graph_dir}/sorted_graph.txt" - if not os.path.exists(sorted_graph): - print(f"Sorting {graph_path} into {sorted_graph}.") - subprocess.run( - [ - "python", - f"{graph_sorter}", - f"{graph_path}", - f"{sorted_graph}", - ] - ) - if not "binaries" in os.listdir(graph_dir): - print(f"binaries directory not found in {graph_dir}") - os.mkdir(f"{graph_dir}/binaries") - print(f"Created {graph_dir}/binaries") - - if not f"gpts_{num_gpts}" in os.listdir(f"{graph_dir}/binaries"): - print(f"gpts_{num_gpts} not found in {graph_dir}/binaries") - os.mkdir(f"{graph_dir}/binaries/gpts_{num_gpts}") - print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all( - [ - binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") - for binary in expected_bins - ] - ): - print( - f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" - ) - for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): - os.remove(delete.path) - print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") - subprocess.run( - [ - f"{graph_reader}", - 
f"{sorted_graph}", - "false", - f"{num_gpts}", - "32", - f"{graph_dir}/binaries/gpts_{num_gpts}", - ] - ) - print( - f"Created the graph binaries in " - f"{graph_dir}/binaries/gpts_{num_gpts}" - ) diff --git a/configs/accl/synth-graph-gen.py b/configs/accl/synth-graph-gen.py deleted file mode 100644 index 15e4a6eff2..0000000000 --- a/configs/accl/synth-graph-gen.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import os -import argparse -import subprocess - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument( - "scale", type=int, help="The scale of the synth graph to generate." - ) - argparser.add_argument( - "deg", - type=int, - help="The average degree of the synth graph to generate.", - ) - argparser.add_argument( - "num_gpts", - type=int, - help="Number gpts to create synth graph binaries for.", - ) - - args = argparser.parse_args() - return args.scale, args.deg, args.num_gpts - - -if __name__ == "__main__": - scale, deg, num_gpts = get_inputs() - - base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.environ.get("GRAPH_GEN") - graph_reader = os.environ.get("GRAPH_READER") - graph_sorter = os.environ.get("GRAPH_SORTER") - if graph_gen is None: - raise ValueError(f"No value for $GRAPH_GEN.") - if graph_reader is None: - raise ValueError(f"No value for $GRAPH_READER.") - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER") - - graph_path = os.path.join(base_dir, f"graph_{scale}_{deg}") - if not os.path.exists(graph_path): - print(f"{graph_path} does not exist already.") - os.mkdir(graph_path) - print(f"Created {graph_path}") - - if not "graph.txt" in os.listdir(graph_path): - print(f"graph.txt not found in {graph_path}") - for delete in os.scandir(graph_path): - os.remove(delete.path) - print(f"Deleted everything in {graph_path}") - subprocess.run( - [ - f"{graph_gen}", - f"{scale}", - f"{deg}", - f"{graph_path}/graph_unordered.txt", - ] - ) - print(f"Generated a graph with scale " f"{scale} and deg {deg}") - subprocess.run( - [ - "python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt", - ] - ) - print( - f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt" - ) - subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) - print(f"Deleted {graph_path}/graph_unordered.txt") - - if not "binaries" in 
os.listdir(graph_path): - print(f"binaries directory not found in {graph_path}") - os.mkdir(f"{graph_path}/binaries") - print(f"Created {graph_path}/binaries") - - if not f"gpts_{num_gpts}" in os.listdir(f"{graph_path}/binaries"): - print(f"gpts_{num_gpts} not found in {graph_path}/binaries") - os.mkdir(f"{graph_path}/binaries/gpts_{num_gpts}") - print(f"Created {graph_path}/binaries/gpts_{num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all( - [ - binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") - for binary in expected_bins - ] - ): - print( - f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" - ) - for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): - os.remove(delete.path) - print( - f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}" - ) - subprocess.run( - [ - f"{graph_reader}", - f"{graph_path}/graph.txt", - "false", - f"{num_gpts}", - "32", - f"{graph_path}/binaries/gpts_{num_gpts}", - ] - ) - print( - f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{num_gpts}" - ) From e10ce6142d0a7e255121d14a2eefe2715756bc1c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 12:57:19 -0800 Subject: [PATCH 223/247] Adding BSP mode. 
--- src/accl/graph/base/data_structs.hh | 30 ++- src/accl/graph/base/graph_workload.hh | 2 +- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/centeral_controller.cc | 66 ++++-- src/accl/graph/sega/centeral_controller.hh | 10 +- src/accl/graph/sega/coalesce_engine.cc | 257 ++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 17 +- src/accl/graph/sega/enums.cc | 15 +- src/accl/graph/sega/enums.hh | 18 ++ src/accl/graph/sega/mpu.hh | 4 + 11 files changed, 308 insertions(+), 117 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 84233ae39c..f09a0dd167 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -43,28 +43,34 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; - uint32_t degree : 32; uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeFuture: 1; std::string to_string() { return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " - "degree: %u}", tempProp, prop, edgeIndex, degree); + "degree: %u, activeNow: %s, activeFuture: %s}", + tempProp, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeFuture ? 
"true" : "false"); } WorkListItem(): tempProp(0), prop(0), + edgeIndex(0), degree(0), - edgeIndex(0) + activeNow(false), + activeFuture(false) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t degree, uint32_t edge_index): - tempProp(temp_prop), - prop(prop), - degree(degree), - edgeIndex(edge_index) + uint32_t degree, uint32_t edge_index, + bool active_now, bool active_future): + tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree), + activeNow(active_now), activeFuture(active_future) {} }; @@ -158,6 +164,10 @@ class UniqueFIFO return fifo.size(); } + void clear() { + fifo.clear(); + } + bool empty() { return fifo.empty(); } @@ -174,6 +184,10 @@ class UniqueFIFO assert(it != fifo.end()); fifo.erase(it); } + + void operator=(const UniqueFIFO& rhs) { + fifo = rhs.fifo; + } }; } diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index de2877d6e8..14a6561ae3 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,7 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; - virtual bool activeCondition(WorkListItem wl) = 0; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 09a997696d..8b43c90102 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -42,6 +42,9 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") cxx_exports = [ + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), PyBindMethod("createPRWorkload"), 
PyBindMethod("printAnswerToHostSimout") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index c2393c2f1e..25f8a1c58b 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,7 +27,6 @@ from m5.params import * from m5.proxy import * -from m5.util.pybind import PyBindMethod from m5.objects.BaseMemoryEngine import BaseMemoryEngine class CoalesceEngine(BaseMemoryEngine): @@ -48,5 +47,3 @@ class CoalesceEngine(BaseMemoryEngine): "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") - - cxx_exports = [PyBindMethod("createPopCountDirectory")] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 60c78559e4..6c924a4703 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -42,7 +42,9 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), - system(params.system) + system(params.system), + mode(ProcessingMode::NOT_SET), + state(BulkSynchronousState::NOT_SET) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -50,11 +52,41 @@ CenteralController::CenteralController(const Params& params): } } +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + +void +CenteralController::createPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} + +void +CenteralController::createPopCountDirectory(int atoms_per_block) +{ + fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " + "mode by calling either setAsyncMode or setBSPMode.") + if (mode == ProcessingMode::ASYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } + if (mode == 
ProcessingMode::BULK_SYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createBSPPopCountDirectory(atoms_per_block); + } + } +} + void CenteralController::startup() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); + mpu->setProcessingMode(mode); mpu->recvWorkload(workload); } @@ -83,7 +115,7 @@ CenteralController::startup() for (auto mpu: mpuVector) { mpu->postMemInitSetup(); - if (!mpu->running() && (mpu->workCount()> 0)) { + if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } @@ -104,18 +136,6 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) return pkt; } -void -CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) -{ - workload = new BFSWorkload(init_addr, init_value); -} - -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold); -} - void CenteralController::recvDoneSignal() { @@ -124,9 +144,25 @@ CenteralController::recvDoneSignal() done &= mpu->done(); } - if (done) { + if (done && mode == ProcessingMode::ASYNCHRONOUS) { exitSimLoopNow("no update left to process."); } + + if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { + assert(state != BulkSynchronousState::DONT_CARE); + if (state == BulkSynchronousState::APPLYING) { + // TODO: + // 1- Toggle directories + // 2- Check if termination condition is met + // 3- If yes, schedule exit event, + // 4- If not switch state to consuming. 
+ exitSimLoopNow("applying done."); + } else if (state == BulkSynchronousState::CONSUMING) { + // TODO: + // Schedule Bulk apply + exitSimLoopNow("consuming done."); + } + } } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ae2980d050..ab0e0c0c09 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/enums.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "params/CenteralController.hh" @@ -46,9 +47,11 @@ class CenteralController : public ClockedObject { private: System* system; - Addr maxVertexAddr; + ProcessingMode mode; + BulkSynchronousState state; + std::vector mpuVector; std::unordered_map addrRangeListMap; @@ -62,6 +65,11 @@ class CenteralController : public ClockedObject CenteralController(const CenteralControllerParams ¶ms); virtual void startup() override; + void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } + void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } + + void createPopCountDirectory(int atoms_per_block); + void createBFSWorkload(Addr init_addr, uint32_t init_value); void createPRWorkload(float alpha, float threshold); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8ac40198be..bfe3fe21b8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,7 +34,6 @@ #include "base/intmath.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" -#include "debug/MSDebug.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -43,7 +42,7 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), lastAtomAddr(0), + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), 
lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), @@ -77,6 +76,8 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } + +// NOTE: Used for initializing memory and reading the final answer void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -85,10 +86,6 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); - // FIXME: Check postPushWBQueue for hits - // Is it really the case though. I don't think at this time - // beacuse we check done after handleMemResp and make sure all - // the writes to memory are done before scheduling an exit event if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -100,7 +97,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - graphWorkload->init(pkt, directory); + graphWorkload->init(pkt, currentDirectory); if (pkt->getAddr() > lastAtomAddr) { lastAtomAddr = pkt->getAddr(); } @@ -111,21 +108,46 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) void CoalesceEngine::postMemInitSetup() { - directory->setLastAtomAddr(lastAtomAddr); + currentDirectory->setLastAtomAddr(lastAtomAddr); } void -CoalesceEngine::createPopCountDirectory(int atoms_per_block) +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) { - directory = new PopCountDirectory( + currentDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectroy( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void 
+CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); } bool CoalesceEngine::done() { - return memoryFunctionQueue.empty() && activeCacheBlocks.empty() && - activeBuffer.empty() && directory->empty() && (onTheFlyReqs == 0); + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); } bool @@ -249,16 +271,21 @@ CoalesceEngine::recvWLRead(Addr addr) // NOTE: The cache block could still be active but // not dirty. If active we only have to active tracking // but can throw the data away. - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -360,16 +387,21 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // Since it is going to the cache, cache will be responsible for // tracking this. Push to activeCacheBlocks for simulator speed // instead of having to search for active blocks in the cache. - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_inde].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); } - if (atom_active) { - int count = directory->deactivate(addr); - activeCacheBlocks.push_back(block_index); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_future) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); } assert(MSHR.find(block_index) != MSHR.end()); @@ -420,15 +452,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem items[numElementsPerLine]; pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition(items[index]); + atom_active |= items[index].activeNow; } - if (atom_active) { - int count = directory->deactivate(addr); + if (atom_active_now) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = currentDirectory->deactivate(addr); activeBuffer.emplace_back(pkt, curTick()); - 
stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); } else { delete pkt; } @@ -486,6 +519,9 @@ CoalesceEngine::processNextResponseEvent() stats.responseQueueLatency.sample( waiting_ticks * 1e9 / getClockFrequency()); if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. if (!responseQueue.empty()) { stats.responsePortShortage++; } @@ -533,12 +569,22 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { cacheBlocks[block_index].dirty |= true; } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; - if ((graphWorkload->activeCondition(cacheBlocks[block_index].items[wl_offset])) && - (!activeCacheBlocks.find(block_index))) { - activeCacheBlocks.push_back(block_index); - if (!owner->running()) { - owner->start(); + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); } } @@ -565,16 +611,22 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) schedule(nextMemoryEvent, nextCycle()); } } else { - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - 
cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_now) { + // TODO: Sample frontier size and blockCount here. + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); } cacheBlocks[block_index].reset(); } @@ -586,6 +638,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); stats.numVertexWrites++; + if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { owner->recvDoneSignal(); } @@ -623,6 +676,8 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } + // FIXME: done() might have a different meaning depending on + // ProcessingMode and Processing state if (done()) { owner->recvDoneSignal(); } @@ -659,6 +714,16 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // the it is definitely currently not active. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } need_send_pkt = false; wb = postPushWBQueue.erase(wb); @@ -677,7 +742,19 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; cacheBlocks[block_index].lastChangedTick = curTick(); - activeCacheBlocks.push_back(block_index); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } need_send_pkt = false; ab = activeBuffer.erase(ab); @@ -767,10 +844,11 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) // NOTE: If the atom we're writing back is active, we have to // stop tracking it in the cache and start tracking it in the memory. 
- bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } PacketPtr pkt = createWritePacket( @@ -779,18 +857,25 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - if (atom_active) { - activeCacheBlocks.erase(block_index); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); if (enoughSpace()) { activeBuffer.emplace_back(pkt, curTick()); } else { - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); onTheFlyReqs++; } } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + } memPort.sendPacket(pkt); onTheFlyReqs++; } @@ -810,17 +895,24 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) void CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) { - if (postPushWBQueue.empty()) { - return; - } - - PacketPtr wb_pkt; - Tick pkt_tick; - std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); - if (schedule_tick == pkt_tick) { - memPort.sendPacket(wb_pkt); - onTheFlyReqs++; - postPushWBQueue.pop_front(); + if (!postPushWBQueue.empty()) 
{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } } } @@ -828,8 +920,8 @@ void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { pullsScheduled--; - if (!directory->empty()) { - Addr addr = directory->getNextWork(); + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); int block_index = getBlockIndex(addr); bool in_cache = cacheBlocks[block_index].addr == addr; @@ -875,8 +967,7 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return activeCacheBlocks.size() + - directory->workCount() + activeBuffer.size(); + return activeCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); } void @@ -905,9 +996,10 @@ CoalesceEngine::processNextApplyEvent() pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { - if (graphWorkload->activeCondition(items[index])) { + if (items[index].activeNow) { Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); pullsReceived--; @@ -919,12 +1011,12 @@ CoalesceEngine::processNextApplyEvent() pkt->allocate(); pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= 
graphWorkload->activeCondition(items[index]); + atom_active_now |= items[index].activeNow; } // NOTE: If the atom is not active anymore. - if (!atom_active) { + if (!atom_active_now) { PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); @@ -946,9 +1038,10 @@ CoalesceEngine::processNextApplyEvent() int block_index = activeCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { - if (graphWorkload->activeCondition(cacheBlocks[block_index].items[index])) { + if (cacheBlocks[block_index].items[index].activeNow) { Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; cacheBlocks[block_index].dirty = true; owner->recvVertexPush(addr, delta, cacheBlocks[block_index].items[index].edgeIndex, @@ -959,20 +1052,20 @@ CoalesceEngine::processNextApplyEvent() } } - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition(cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; } // NOTE: If we have reached the last item in the cache block - if (!atom_active) { - activeCacheBlocks.erase(block_index); + if (!atom_active_now) { + currentActiveCacheBlocks.erase(block_index); } break; } // NOTE: If the block with index at the front of activeCacheBlocks // is not in IDLE state, then roll the that index to the back - activeCacheBlocks.pop_front(); - activeCacheBlocks.push_back(block_index); + currentActiveCacheBlocks.pop_front(); + currentActiveCacheBlocks.push_back(block_index); // NOTE: If we have visited all the items initially in the FIFO. 
num_visited_indices++; if (num_visited_indices == initial_fifo_length) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f605704b6d..39f2491232 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -96,7 +96,9 @@ class CoalesceEngine : public BaseMemoryEngine }; MPU* owner; - WorkDirectory* directory; + ProcessingMode mode; + WorkDirectory* currentDirectory; + WorkDirectory* futureDirectory; GraphWorkload* graphWorkload; Addr lastAtomAddr; @@ -114,8 +116,9 @@ class CoalesceEngine : public BaseMemoryEngine // Tracking work in cache int pullsReceived; - // NOTE: Remember to erase from this upon eviction from cache - UniqueFIFO activeCacheBlocks; + // NOTE: Remember to erase from these upon eviction from cache + UniqueFIFO currentActiveCacheBlocks; + UniqueFIFO futureActiveCacheBlocks; int pullsScheduled; int pendingPullLimit; @@ -195,12 +198,14 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + void setProcessingMode(ProcessingMode _mode) { mode = _mode; } + void createAsyncPopCountDirectory(int atoms_per_block); + void createBSPPopCountDirectory(int atoms_per_block); void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); void postMemInitSetup(); - - void createPopCountDirectory(int atoms_per_block); + void swapDirectories(); ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index de5d569c18..83f3033427 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -39,7 +39,6 @@ const char* cacheStateStrings[NUM_CACHE_STATE] = { "PENDING_WB" }; - const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = { "ACCEPT", @@ -53,4 +52,18 @@ const char* 
readDestinationStrings[NUM_READ_DESTINATION] = "READ_FOR_PUSH" }; +const char* processingModeStrings[NUM_PROCESSING_MODE] = +{ + "NOT_SET", + "ASYNCHRONOUS", + "BULK_SYNCHRONOUS" +}; + +const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = +{ + "NOT_SET", + "CONSUMING", + "APPLYING" +}; + } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 6153386b71..f6d199bf7d 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -60,6 +60,24 @@ enum ReadDestination }; extern const char* readDestinationStrings[NUM_READ_DESTINATION]; +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +enum BulkSynchronousStates +{ + NOT_SET, + CONSUMING, + APPLYING, + NUM_BULK_SYNCHRONOUS_STATE, +} +extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; + } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index ad18a0d5a5..358394ffc5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -63,6 +63,10 @@ class MPU : public SimObject MPU(const Params& params); void registerCenteralController(CenteralController* centeral_controller); + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalseceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalseceEngine->createBSPPopCountDirectory(atoms_per_block); } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } From 454e1e3a81c2818ea532183335fd94e731899326 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 
13:12:57 -0800 Subject: [PATCH 224/247] Fixing enums --- src/accl/graph/sega/centeral_controller.cc | 5 ++++- src/accl/graph/sega/enums.hh | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6c924a4703..6e5f3ffcec 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -113,6 +113,9 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + state = BulkSynchronousStates::CONSUMING; + } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -149,7 +152,7 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::DONT_CARE); + assert(state != BulkSynchronousState::NOT_SET); if (state == BulkSynchronousState::APPLYING) { // TODO: // 1- Toggle directories diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index f6d199bf7d..8280f122c3 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -75,7 +75,7 @@ enum BulkSynchronousStates CONSUMING, APPLYING, NUM_BULK_SYNCHRONOUS_STATE, -} +}; extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; } // namespace gem5 From f4b8685a29d80717374c2d222bfc96e5cec25266 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:15:52 -0800 Subject: [PATCH 225/247] Further fixes for enums. 
--- src/accl/graph/sega/centeral_controller.cc | 4 ++-- src/accl/graph/sega/enums.cc | 2 +- src/accl/graph/sega/enums.hh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6e5f3ffcec..c6b9cf7a52 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -44,7 +44,7 @@ CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), mode(ProcessingMode::NOT_SET), - state(BulkSynchronousState::NOT_SET) + state(BulkSynchronousState::DONT_CARE) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -152,7 +152,7 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::NOT_SET); + assert(state != BulkSynchronousState::DONT_CARE); if (state == BulkSynchronousState::APPLYING) { // TODO: // 1- Toggle directories diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 83f3033427..099594e9eb 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -61,7 +61,7 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = { - "NOT_SET", + "DONT_CARE", "CONSUMING", "APPLYING" }; diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 8280f122c3..4c94412c9b 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -71,7 +71,7 @@ extern const char* processingModeStrings[NUM_PROCESSING_MODE]; enum BulkSynchronousStates { - NOT_SET, + DONT_CARE, CONSUMING, APPLYING, NUM_BULK_SYNCHRONOUS_STATE, From c3fd13291d5a4ecf5e43713888a4de11769b05a4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:21:12 -0800 Subject: [PATCH 226/247] Fixing typos --- src/accl/graph/sega/enums.hh | 2 +- src/accl/graph/sega/mpu.hh | 4 ++-- 2 
files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 4c94412c9b..969ee8a976 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -69,7 +69,7 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum BulkSynchronousStates +enum BulkSynchronousState { DONT_CARE, CONSUMING, diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 358394ffc5..7d75e3e0b7 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -64,8 +64,8 @@ class MPU : public SimObject void registerCenteralController(CenteralController* centeral_controller); void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } - void createAsyncPopCountDirectory(int atoms_per_block) { coalseceEngine->createAsyncPopCountDirectory(atoms_per_block); } - void createBSPPopCountDirectory(int atoms_per_block) { coalseceEngine->createBSPPopCountDirectory(atoms_per_block); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } From 513e3f6beb77eb97902f9c0eafd5791b4dc9dcff Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:31:55 -0800 Subject: [PATCH 227/247] Fixing typos. 
--- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c6b9cf7a52..df1abbedc3 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -114,7 +114,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - state = BulkSynchronousStates::CONSUMING; + state = BulkSynchronousState::CONSUMING; } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index bfe3fe21b8..6efafbb76c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -124,7 +124,7 @@ CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) { currentDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); - futureDirectory = new PopCountDirectroy( + futureDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); } @@ -390,7 +390,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool atom_active_now = false; bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active_now |= cacheBlocks[block_inde].items[index].activeNow; + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { @@ -453,12 +453,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem items[numElementsPerLine]; pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= 
items[index].activeNow; + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; } if (atom_active_now) { // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); + if (atom_active_future) { + int count_2 = futureDirectory->deactivate(addr); + } activeBuffer.emplace_back(pkt, curTick()); // stats.blockActiveCount.sample(count); // stats.frontierSize.sample(directory->workCount()); @@ -573,7 +578,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; if (mode == ProcessingMode::ASYNCHRONOUS) { - cacheBlocks[block_index].activeNow |= active; + cacheBlocks[block_index].items[wl_offset].activeNow |= active; if (active && (!currentActiveCacheBlocks.find(block_index))) { currentActiveCacheBlocks.push_back(block_index); if (!owner->running()) { @@ -582,7 +587,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } } if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - cacheBlocks[block_index].activeFuture |= active; + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; if (active && (!futureActiveCacheBlocks.find(block_index))) { futureActiveCacheBlocks.push_back(block_index); } @@ -903,7 +908,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) WorkListItem items[numElementsPerLine]; wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); bool atom_active_future = false; - for (int index = 0; index < numElementPerLine; index++) { + for (int index = 0; index < numElementsPerLine; index++) { atom_active_future |= items[index].activeFuture; } if (atom_active_future) { @@ -967,7 +972,7 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return activeCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); + return currentActiveCacheBlocks.size() + 
currentDirectory->workCount() + activeBuffer.size(); } void @@ -1031,7 +1036,7 @@ CoalesceEngine::processNextApplyEvent() } delete pkt; } - } else if (!activeCacheBlocks.empty()) { + } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; int initial_fifo_length = activeCacheBlocks.size(); while (true) { From d9ae6bed35e40240d7f6c80eb4c37b816099885d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:41:42 -0800 Subject: [PATCH 228/247] Fixing typos. --- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index df1abbedc3..db0f7941ed 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -68,7 +68,7 @@ void CenteralController::createPopCountDirectory(int atoms_per_block) { fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " - "mode by calling either setAsyncMode or setBSPMode.") + "mode by calling either setAsyncMode or setBSPMode."); if (mode == ProcessingMode::ASYNCHRONOUS) { for (auto mpu: mpuVector) { mpu->createAsyncPopCountDirectory(atoms_per_block); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6efafbb76c..e3c194566a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1038,9 +1038,9 @@ CoalesceEngine::processNextApplyEvent() } } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; - int initial_fifo_length = activeCacheBlocks.size(); + int initial_fifo_length = crrentActiveCacheBlocks.size(); while (true) { - int block_index = activeCacheBlocks.front(); + int block_index = currentActiveCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 
0); index++) { if (cacheBlocks[block_index].items[index].activeNow) { From 37ec3ddacd9e25127f5ee90a7341956549bc731d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:54:47 -0800 Subject: [PATCH 229/247] Debug. --- src/accl/graph/base/graph_workload.cc | 74 +++++++++++++++++++++- src/accl/graph/base/graph_workload.hh | 36 +++++------ src/accl/graph/sega/centeral_controller.cc | 10 +-- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- 5 files changed, 97 insertions(+), 27 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index e36c074da9..a78b3c1526 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -92,9 +92,9 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) } bool -BFSWorkload::activeCondition(WorkListItem wl) +BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - return (wl.tempProp < wl.prop) && (wl.degree > 0); + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree; > 0); } uint32_t @@ -298,6 +298,76 @@ PRWorkload::printWorkListItem(const WorkListItem wl) "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", temp_float, prop_float, wl.degree, wl.edgeIndex); } +// void +// PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +// { +// size_t pkt_size = pkt->getSize(); +// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); +// WorkListItem items[num_elements]; + +// pkt->writeDataToBlock((uint8_t*) items, pkt_size); +// bool atom_active = false; +// for (int i = 0; i < num_elements; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - alpha); +// atom_active |= activeCondition(items[i]); +// } +// if (atom_active) { +// dir->activate(pkt->getAddr()); +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, pkt_size); +// } + +// uint32_t +// PRWorkload::reduce(uint32_t 
update, uint32_t value) +// { +// float update_float = writeToFloat(update); +// float value_float = writeToFloat(value); +// return readFromFloat(update_float + value_float); +// } + +// uint32_t +// PRWorkload::propagate(uint32_t value, uint32_t weight) +// { +// float value_float = writeToFloat(value); +// float weight_float = writeToFloat(weight); +// if (weight == 0) { +// weight_float = 1.0; +// } +// return readFromFloat(alpha * value_float * weight_float); +// } + +// bool +// PRWorkload::activeCondition(WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float dist = std::abs(temp_float - prop_float); +// return (dist >= threshold) && (wl.degree > 0); +// } + +// uint32_t +// PRWorkload::apply(WorkListItem& wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float delta = (temp_float - prop_float) / wl.degree; +// uint32_t delta_uint = readFromFloat(delta); +// wl.prop = wl.tempProp; +// return delta_uint; +// } + +// std::string +// PRWorkload::printWorkListItem(const WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// return csprintf( +// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", +// temp_float, prop_float, wl.degree, wl.edgeIndex); +// } void CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 14a6561ae3..8e27d16bf9 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -72,7 +72,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem 
old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -119,26 +119,26 @@ class SSSPWorkload : public GraphWorkload }; -class PRWorkload : public GraphWorkload -{ - private: - float alpha; - float threshold; +// class PRWorkload : public GraphWorkload +// { +// private: +// float alpha; +// float threshold; - public: - PRWorkload(float alpha, float threshold): - alpha(alpha), threshold(threshold) - {} +// public: +// PRWorkload(float alpha, float threshold): +// alpha(alpha), threshold(threshold) +// {} - ~PRWorkload() {} +// ~PRWorkload() {} - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; class CCWorkload : public GraphWorkload { diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index db0f7941ed..7de6f61b56 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -58,11 +58,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold); -} +// void +// CenteralController::createPRWorkload(float alpha, float threshold) +// { +// workload = new PRWorkload(alpha, threshold); +// } void 
CenteralController::createPopCountDirectory(int atoms_per_block) diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ab0e0c0c09..b32dc38385 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -71,7 +71,7 @@ class CenteralController : public ClockedObject void createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createPRWorkload(float alpha, float threshold); + // void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e3c194566a..6b44f7395b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1038,7 +1038,7 @@ CoalesceEngine::processNextApplyEvent() } } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; - int initial_fifo_length = crrentActiveCacheBlocks.size(); + int initial_fifo_length = currentActiveCacheBlocks.size(); while (true) { int block_index = currentActiveCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { From 4abd1cd5ec0e131cd56a741395e7ffe1bcdb2dd0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:06:44 -0800 Subject: [PATCH 230/247] Debugging. 
--- src/accl/graph/base/graph_workload.cc | 8 +++++--- src/accl/graph/sega/CenteralController.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index a78b3c1526..50024965a1 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -67,12 +67,14 @@ BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem items[num_elements]; pkt->writeDataToBlock((uint8_t*) items, pkt_size); - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = initValue; + if (activeCondition(new_wl, items[index])) { dir->activate(aligned_addr); } + items[index] = new_wl; + pkt->deleteData(); pkt->allocate(); pkt->setDataFromBlock((uint8_t*) items, pkt_size); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 8b43c90102..6de9e03a1c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,6 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), - PyBindMethod("createPRWorkload"), + # PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] From 32a0f813e93accd59bd0f8d70430d9d5972d6317 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:08:17 -0800 Subject: [PATCH 231/247] Typos. 
--- src/accl/graph/base/graph_workload.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 50024965a1..9c21a3932a 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -96,7 +96,7 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) bool BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree; > 0); + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0); } uint32_t From 1352e207854c3f38670358efa991967ecb0a3089 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:14:41 -0800 Subject: [PATCH 232/247] Debugging. --- src/accl/graph/base/graph_workload.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 9c21a3932a..8536c2bbd8 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -71,6 +71,7 @@ BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem new_wl = items[index]; new_wl.tempProp = initValue; if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; dir->activate(aligned_addr); } items[index] = new_wl; From f13057c8ad23d2c91203cf2ac151ce3cd54f4169 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 00:58:54 -0800 Subject: [PATCH 233/247] Finalizing bsp and pr. 
--- configs/accl/bfs.py | 3 +- configs/accl/pr.py | 28 +++-- configs/accl/sega.py | 22 ++-- configs/accl/sega_simple.py | 21 ++-- src/accl/graph/base/graph_workload.cc | 131 ++++++--------------- src/accl/graph/base/graph_workload.hh | 34 +++--- src/accl/graph/sega/CenteralController.py | 3 +- src/accl/graph/sega/centeral_controller.cc | 46 ++++---- src/accl/graph/sega/centeral_controller.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 63 ++++++++++ src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/enums.cc | 7 -- src/accl/graph/sega/enums.hh | 9 -- src/accl/graph/sega/mpu.hh | 2 + 14 files changed, 193 insertions(+), 182 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 806aa8a915..ab5de485b1 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -88,7 +88,7 @@ def get_inputs(): sample, verify, ) = get_inputs() - + if simple: from sega_simple import SEGA else: @@ -98,6 +98,7 @@ def get_inputs(): m5.instantiate() + system.set_async_mode() system.create_pop_count_directory(64) system.create_bfs_workload(init_addr, init_value) if sample: diff --git a/configs/accl/pr.py b/configs/accl/pr.py index e3d7c764ad..ea8a103640 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -35,9 +35,9 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) - argparser.add_argument("threshold", type=float) argparser.add_argument( "--simple", dest="simple", @@ -69,8 +69,8 @@ def get_inputs(): args.num_gpts, args.cache_size, args.graph, + args.iterations, args.alpha, - args.threshold, args.simple, args.sample, args.verify, @@ -82,13 +82,13 @@ def get_inputs(): num_gpts, cache_size, graph, + iterations, alpha, - threshold, simple, sample, verify, ) = get_inputs() - + if simple: from sega_simple import SEGA 
else: @@ -98,8 +98,9 @@ def get_inputs(): m5.instantiate() + system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha, threshold) + system.create_pr_workload(alpha) if sample: while True: exit_event = m5.simulate(100000000) @@ -112,11 +113,16 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") if verify: system.print_answer() - diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 1ea36ea49e..07e1b36d9d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -73,8 +73,8 @@ def __init__( ) self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False) + dram= + DDR4_2400_8x8(range=AddrRange(edge_memory_size), in_addr_map=False) ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -124,7 +124,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + gpt = GPT("16GiB", cache_size) gpt.set_vertex_range( [vertex_ranges[i], vertex_ranges[i + num_mpus]] ) @@ -139,15 +139,23 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + def create_pop_count_directory(self, atoms_per_block): - for gpt in self.gpts: - gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + 
self.ctrl.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index f59fa71a79..8727a4c90d 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -66,9 +66,9 @@ def __init__( max_propagates_per_cycle=8, update_queue_size=32, ) - + self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") - + self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( range=AddrRange(edge_memory_size), in_addr_map=False) @@ -129,16 +129,23 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + def create_pop_count_directory(self, atoms_per_block): - for gpt in self.gpts: - gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + self.ctrl.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) def print_answer(self): self.ctrl.printAnswerToHostSimout() - diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 8536c2bbd8..1fa2b287c4 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -111,9 +111,11 @@ std::string BFSWorkload::printWorkListItem(const WorkListItem wl) { return 
csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); + "WorkListItem{tempProp: %u, prop: %u, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); } void @@ -232,7 +234,7 @@ SSSPWorkload::printWorkListItem(const WorkListItem wl) void -PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { size_t pkt_size = pkt->getSize(); int num_elements = (int) (pkt_size / sizeof(WorkListItem)); @@ -241,9 +243,12 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) pkt->writeDataToBlock((uint8_t*) items, pkt_size); bool atom_active = false; for (int i = 0; i < num_elements; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - alpha); - atom_active |= activeCondition(items[i]); + WorkListItem new_wl = items[i]; + new_wl.tempProp = readFromFloat(1 - alpha); + new_wl.prop = readFromFloat(1); + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; } if (atom_active) { dir->activate(pkt->getAddr()); @@ -254,7 +259,7 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) } uint32_t -PRWorkload::reduce(uint32_t update, uint32_t value) +BSPPRWorkload::reduce(uint32_t update, uint32_t value) { float update_float = writeToFloat(update); float value_float = writeToFloat(value); @@ -262,115 +267,47 @@ PRWorkload::reduce(uint32_t update, uint32_t value) } uint32_t -PRWorkload::propagate(uint32_t value, uint32_t weight) +BSPPRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(weight); - if (weight == 0) { - weight_float = 1.0; - } - return readFromFloat(alpha * value_float * weight_float); + return readFromFloat(alpha * value_float); } bool 
-PRWorkload::activeCondition(WorkListItem wl) +BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float dist = std::abs(temp_float - prop_float); - return (dist >= threshold) && (wl.degree > 0); + return (old_wl.degree > 0); } uint32_t -PRWorkload::apply(WorkListItem& wl) +BSPPRWorkload::apply(WorkListItem& wl) { - float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); - float delta = (temp_float - prop_float) / wl.degree; + float delta = prop_float / wl.degree; uint32_t delta_uint = readFromFloat(delta); - wl.prop = wl.tempProp; return delta_uint; } +void +BSPPRWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + wl.tempProp = readFromFloat(1 - alpha); + wl.activeFuture = (wl.degree > 0); +} + std::string -PRWorkload::printWorkListItem(const WorkListItem wl) +BSPPRWorkload::printWorkListItem(const WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); return csprintf( - "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", - temp_float, prop_float, wl.degree, wl.edgeIndex); -} -// void -// PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) -// { -// size_t pkt_size = pkt->getSize(); -// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); -// WorkListItem items[num_elements]; - -// pkt->writeDataToBlock((uint8_t*) items, pkt_size); -// bool atom_active = false; -// for (int i = 0; i < num_elements; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - alpha); -// atom_active |= activeCondition(items[i]); -// } -// if (atom_active) { -// dir->activate(pkt->getAddr()); -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, pkt_size); -// } - -// uint32_t -// PRWorkload::reduce(uint32_t update, uint32_t value) -// { -// float update_float = 
writeToFloat(update); -// float value_float = writeToFloat(value); -// return readFromFloat(update_float + value_float); -// } - -// uint32_t -// PRWorkload::propagate(uint32_t value, uint32_t weight) -// { -// float value_float = writeToFloat(value); -// float weight_float = writeToFloat(weight); -// if (weight == 0) { -// weight_float = 1.0; -// } -// return readFromFloat(alpha * value_float * weight_float); -// } - -// bool -// PRWorkload::activeCondition(WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float dist = std::abs(temp_float - prop_float); -// return (dist >= threshold) && (wl.degree > 0); -// } - -// uint32_t -// PRWorkload::apply(WorkListItem& wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float delta = (temp_float - prop_float) / wl.degree; -// uint32_t delta_uint = readFromFloat(delta); -// wl.prop = wl.tempProp; -// return delta_uint; -// } - -// std::string -// PRWorkload::printWorkListItem(const WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// return csprintf( -// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", -// temp_float, prop_float, wl.degree, wl.edgeIndex); -// } + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} void CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 8e27d16bf9..fdd4928e10 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; @@ -72,6 +73,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -119,26 +121,24 @@ class SSSPWorkload : public GraphWorkload }; -// class PRWorkload : public GraphWorkload -// { -// private: -// float alpha; -// float threshold; +class BSPPRWorkload : public GraphWorkload +{ + private: + float alpha; -// public: -// PRWorkload(float alpha, float threshold): -// alpha(alpha), threshold(threshold) -// {} + public: + BSPPRWorkload(float alpha): alpha(alpha) {} -// ~PRWorkload() {} + ~BSPPRWorkload() {} -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t 
reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; class CCWorkload : public GraphWorkload { diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 6de9e03a1c..9dd8f41e61 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,7 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), - # PyBindMethod("createPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7de6f61b56..0103b1a0c4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -43,8 +43,7 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), - mode(ProcessingMode::NOT_SET), - state(BulkSynchronousState::DONT_CARE) + mode(ProcessingMode::NOT_SET) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -58,11 +57,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -// void -// CenteralController::createPRWorkload(float alpha, float threshold) -// { -// workload = new PRWorkload(alpha, threshold); -// } +void +CenteralController::createPRWorkload(float alpha) +{ + workload = new BSPPRWorkload(alpha); +} void CenteralController::createPopCountDirectory(int atoms_per_block) @@ -113,9 +112,6 @@ CenteralController::startup() 
panic_if(!image.write(proxy), "%s: Unable to write image."); - if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - state = BulkSynchronousState::CONSUMING; - } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -152,20 +148,25 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::DONT_CARE); - if (state == BulkSynchronousState::APPLYING) { - // TODO: - // 1- Toggle directories - // 2- Check if termination condition is met - // 3- If yes, schedule exit event, - // 4- If not switch state to consuming. - exitSimLoopNow("applying done."); - } else if (state == BulkSynchronousState::CONSUMING) { - // TODO: - // Schedule Bulk apply - exitSimLoopNow("consuming done."); + for (auto mpu: mpuVector) { + mpu->postConsumeProcess(); + mpu->swapDirectories(); + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } } + exitSimLoopNow("finished an iteration."); + } +} + +int +CenteralController::workCount() +{ + int work_count = 0; + for (auto mpu: mpuVector) { + work_count += mpu->workCount(); } + return work_count; } void @@ -184,7 +185,6 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - workload->apply(items[i]); std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index b32dc38385..ab039e5024 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -50,7 +50,6 @@ class CenteralController : public ClockedObject Addr maxVertexAddr; ProcessingMode mode; - BulkSynchronousState state; std::vector mpuVector; std::unordered_map addrRangeListMap; @@ -71,10 +70,11 @@ class CenteralController : public ClockedObject void 
createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); - // void createPRWorkload(float alpha, float threshold); + void createPRWorkload(float alpha); void recvDoneSignal(); + int workCount(); void printAnswerToHostSimout(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6b44f7395b..32b946d29f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -111,6 +111,69 @@ CoalesceEngine::postMemInitSetup() currentDirectory->setLastAtomAddr(lastAtomAddr); } +void +CoalesceEngine::postConsumeProcess() +{ + WorkListItem items[numElementsPerLine]; + for (Addr addr = 0; addr <= lastAtomAddr; addr += peerMemoryAtomSize) { + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + // if (cacheBlocks[block_index].items[index].activeFuture) { + // graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + // cacheBlocks[block_index].items[index].activeNow = true; + // cacheBlocks[block_index].items[index].activeFuture = false; + // } + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + 
futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + delete read_pkt; + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete write_pkt; + } + } +} + void CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 39f2491232..c9d8e47f15 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -205,12 +205,14 @@ class CoalesceEngine : public BaseMemoryEngine virtual void recvFunctional(PacketPtr pkt); void postMemInitSetup(); + void postConsumeProcess(); void swapDirectories(); ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); int workCount(); + int futureWorkCount(); void recvVertexPull(); bool done(); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 099594e9eb..f7ef96197f 100644 --- a/src/accl/graph/sega/enums.cc +++ 
b/src/accl/graph/sega/enums.cc @@ -59,11 +59,4 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = "BULK_SYNCHRONOUS" }; -const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = -{ - "DONT_CARE", - "CONSUMING", - "APPLYING" -}; - } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 969ee8a976..f97c33a0e0 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -69,15 +69,6 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum BulkSynchronousState -{ - DONT_CARE, - CONSUMING, - APPLYING, - NUM_BULK_SYNCHRONOUS_STATE, -}; -extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; - } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 7d75e3e0b7..04393db36d 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -70,6 +70,8 @@ class MPU : public SimObject AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } bool handleIncomingUpdate(PacketPtr pkt); From f59afb8fb699e6ae63af78d6e4dfc165696c319f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 11:17:39 -0800 Subject: [PATCH 234/247] Fixing a bug in async mode. 
--- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/CenteralController.py | 3 ++- src/accl/graph/sega/centeral_controller.cc | 10 +++++----- src/accl/graph/sega/coalesce_engine.cc | 6 +++--- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 07e1b36d9d..b5ce618f7f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -124,7 +124,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("16GiB", cache_size) + gpt = GPT("2GiB", cache_size) gpt.set_vertex_range( [vertex_ranges[i], vertex_ranges[i + num_mpus]] ) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 8727a4c90d..ff97134b47 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -117,7 +117,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("4GiB", cache_size) + gpt = GPT("2GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 9dd8f41e61..f9544ec539 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -37,7 +37,8 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") - image_file = Param.String("Path to the vertex image file.") + vertex_image_file = Param.String("Path to the vertex image file.") + edgelist_image_file = Param.String("Path to the edgelist image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0103b1a0c4..c44789f9f0 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -89,7 +89,7 @@ 
CenteralController::startup() mpu->recvWorkload(workload); } - const auto& file = params().image_file; + const auto& vertex_file = params().vertex_image_file; if (file == "") return; @@ -97,10 +97,10 @@ CenteralController::startup() fatal_if(!object, "%s: Could not load %s.", name(), file); loader::debugSymbolTable.insert(*object->symtab().globals()); - loader::MemoryImage image = object->buildImage(); - maxVertexAddr = image.maxAddr(); + loader::MemoryImage vertex_image = object->buildImage(); + maxVertexAddr = vertex_image.maxAddr(); - PortProxy proxy( + PortProxy vertex_proxy( [this](PacketPtr pkt) { for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; @@ -110,7 +110,7 @@ CenteralController::startup() } }, system->cacheLineSize()); - panic_if(!image.write(proxy), "%s: Unable to write image."); + panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 32b946d29f..35b2bf71cf 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -744,8 +744,6 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } - // FIXME: done() might have a different meaning depending on - // ProcessingMode and Processing state if (done()) { owner->recvDoneSignal(); } @@ -934,7 +932,9 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); - int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + if (atom_active_future) { + int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + } // stats.blockActiveCount.sample(count); // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); From 772795067298f974d713a6a605b0056e30bfe537 Mon 
Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 16:03:25 -0800 Subject: [PATCH 235/247] Debugging and removing typos. sega-ddr represent correct system config. --- configs/accl/sega-ddr/bfs.py | 125 +++++++++++++ configs/accl/sega-ddr/pr.py | 128 +++++++++++++ configs/accl/sega-ddr/sega.py | 200 +++++++++++++++++++++ src/accl/graph/sega/CenteralController.py | 1 - src/accl/graph/sega/centeral_controller.cc | 6 +- src/accl/graph/sega/coalesce_engine.cc | 68 ++++--- src/accl/graph/sega/coalesce_engine.hh | 9 +- 7 files changed, 505 insertions(+), 32 deletions(-) create mode 100644 configs/accl/sega-ddr/bfs.py create mode 100644 configs/accl/sega-ddr/pr.py create mode 100644 configs/accl/sega-ddr/sega.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py new file mode 100644 index 0000000000..8766822b33 --- /dev/null +++ b/configs/accl/sega-ddr/bfs.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + 
system.set_async_mode() + system.create_pop_count_directory(64) + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/pr.py b/configs/accl/sega-ddr/pr.py new file mode 100644 index 0000000000..ea8a103640 --- /dev/null +++ b/configs/accl/sega-ddr/pr.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("iterations", type=int) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.iterations, + args.alpha, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + iterations, + alpha, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_pr_workload(alpha) + if sample: + while True: + 
exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py new file mode 100644 index 0000000000..c5545ee0f1 --- /dev/null +++ b/configs/accl/sega-ddr/sega.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=32, + active_buffer_size=64, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + 
push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=8, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("16GiB") + 
mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + gpt.set_vertex_pch_bit(pch_bit) + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index f9544ec539..bda2fa3d6a 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -38,7 +38,6 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") vertex_image_file = Param.String("Path to the vertex image file.") - edgelist_image_file = Param.String("Path to the edgelist image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c44789f9f0..26e4473b03 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ 
b/src/accl/graph/sega/centeral_controller.cc @@ -90,11 +90,11 @@ CenteralController::startup() } const auto& vertex_file = params().vertex_image_file; - if (file == "") + if (vertex_file == "") return; - auto* object = loader::createObjectFile(file, true); - fatal_if(!object, "%s: Could not load %s.", name(), file); + auto* object = loader::createObjectFile(vertex_file, true); + fatal_if(!object, "%s: Could not load %s.", name(), vertex_file); loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage vertex_image = object->buildImage(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 35b2bf71cf..263e08d901 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -343,12 +343,14 @@ CoalesceEngine::recvWLRead(Addr addr) if (atom_active_now) { currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -457,14 +459,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = futureDirectory->deactivate(addr); futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } assert(MSHR.find(block_index) != MSHR.end()); @@ -522,15 +526,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active_future |= items[index].activeFuture; } if (atom_active_now) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); if (atom_active_future) { - int count_2 = futureDirectory->deactivate(addr); + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } activeBuffer.emplace_back(pkt, curTick()); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); } else { + stats.wastefulBytesRead += pkt->getSize(); delete pkt; } @@ -686,15 +692,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - // TODO: Sample frontier size and blockCount here. 
currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } cacheBlocks[block_index].reset(); } @@ -932,17 +939,21 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); if (atom_active_future) { - int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); onTheFlyReqs++; } } else { if (atom_active_future) { int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } memPort.sendPacket(pkt); onTheFlyReqs++; @@ -956,7 +967,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); - stats.numInvalidWriteBacks++; } } @@ -1141,8 +1151,8 @@ CoalesceEngine::processNextApplyEvent() } } } else { - 
DPRINTF(CoalesceEngine, "%s: Could not find " - "work to apply.\n", __func__); + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; } if (pullCondition()) { @@ -1184,6 +1194,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by coalesce engine"), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), @@ -1192,8 +1204,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. 
(Relative to reset_stats)"), - ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), - "Number of times a scheduled memory function has been invalid."), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "cycles the coalesce engine could not find work for apply"), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(frontierSize, statistics::units::Count::get(), - "Histogram of the length of the bitvector."), - ADD_STAT(blockActiveCount, statistics::units::Count::get(), - "Histogram of the popCount values in the directory"), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory"), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), @@ -1225,8 +1241,10 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - frontierSize.init(64); - blockActiveCount.init(64); + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c9d8e47f15..8ee17781fc 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -171,18 +171,21 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar numConflicts; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; + statistics::Scalar wastefulBytesRead; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidWriteBacks; + statistics::Scalar worklessCycles; statistics::Formula hitRate; statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram frontierSize; - statistics::Histogram blockActiveCount; + statistics::Histogram currentFrontierSize; + statistics::Histogram futureFrontierSize; + statistics::Histogram currentBlockActiveCount; + statistics::Histogram futureBlockActiveCount; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; From 93624ccbddc96f8a561c97a4864f6894d708d528 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 22:51:41 -0800 Subject: [PATCH 236/247] Debugging, finalizing the config and merging new workloads. 
--- configs/accl/sega-ddr/bfs.py | 15 +- configs/accl/sega-ddr/cc.py | 119 +++++++++++ configs/accl/sega-ddr/sega.py | 15 +- configs/accl/sega-ddr/sssp.py | 125 +++++++++++ src/accl/graph/base/graph_workload.cc | 233 +++++++-------------- src/accl/graph/base/graph_workload.hh | 81 ++++--- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/centeral_controller.cc | 18 ++ src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/push_engine.cc | 1 - 10 files changed, 408 insertions(+), 205 deletions(-) create mode 100644 configs/accl/sega-ddr/cc.py create mode 100644 configs/accl/sega-ddr/sssp.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py index 8766822b33..97f1b5dc21 100644 --- a/configs/accl/sega-ddr/bfs.py +++ b/configs/accl/sega-ddr/bfs.py @@ -39,6 +39,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) argparser.add_argument( "--simple", dest="simple", @@ -73,6 +81,7 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.visited, args.simple, args.sample, args.verify, @@ -87,6 +96,7 @@ def get_inputs(): graph, init_addr, init_value, + visited, simple, sample, verify, @@ -103,7 +113,10 @@ def get_inputs(): system.set_async_mode() system.create_pop_count_directory(64) - system.create_bfs_workload(init_addr, init_value) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/sega-ddr/cc.py b/configs/accl/sega-ddr/cc.py new file mode 100644 index 0000000000..9b6d2b587d --- /dev/null +++ b/configs/accl/sega-ddr/cc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 The Regents of the 
University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_cc_workload() + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py index c5545ee0f1..8325cf7565 100644 --- a/configs/accl/sega-ddr/sega.py +++ 
b/configs/accl/sega-ddr/sega.py @@ -56,8 +56,8 @@ def __init__(self, register_file_size: int, cache_size: str): attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -121,7 +121,7 @@ def __init__(self, size: str): dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) ) self.xbar = NoncoherentXBar( - width=8, frontend_latency=1, forward_latency=1, response_latency=1 + width=64, frontend_latency=1, forward_latency=1, response_latency=1 ) self.xbar.mem_side_ports = self.mem_ctrl.port @@ -193,6 +193,15 @@ def create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) diff --git a/configs/accl/sega-ddr/sssp.py b/configs/accl/sega-ddr/sssp.py new file mode 100644 index 0000000000..f2e60b856a --- /dev/null +++ b/configs/accl/sega-ddr/sssp.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_sssp_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + 
system.print_answer() diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 1fa2b287c4..7471e4d073 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -118,90 +118,95 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } -void -BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) -{ - size_t pkt_size = pkt->getSize(); - uint64_t aligned_addr = roundDown(initAddr, pkt_size); - - if (pkt->getAddr() == aligned_addr) { - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); - - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { - dir->activate(aligned_addr); - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); - } -} +// void +// BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) +// { +// size_t pkt_size = pkt->getSize(); +// uint64_t aligned_addr = roundDown(initAddr, pkt_size); + +// if (pkt->getAddr() == aligned_addr) { +// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); +// WorkListItem items[num_elements]; + +// pkt->writeDataToBlock((uint8_t*) items, pkt_size); + +// int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); +// items[index].tempProp = initValue; +// if (activeCondition(items[index])) { +// dir->activate(aligned_addr); +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, pkt_size); +// } +// } + +// uint32_t +// BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) +// { +// return std::min(update, value); +// } + +// uint32_t +// BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) +// { +// return 1; +// } + +// bool +// BFSVisitedWorkload::activeCondition(WorkListItem wl) +// { +// return 
(wl.tempProp < wl.prop) && (wl.degree > 0); +// } + +// uint32_t +// BFSVisitedWorkload::apply(WorkListItem& wl) +// { +// wl.prop = wl.tempProp; +// return wl.prop; +// } + +// std::string +// BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) +// { +// return csprintf( +// "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", +// wl.tempProp, wl.prop, wl.degree, wl.edgeIndex +// ); +// } uint32_t -BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); -} - -uint32_t -BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) -{ - return 1; -} - -bool -BFSVisitedWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -BFSVisitedWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { + return value; } void -SSSPWorkload::init(PacketPtr pkt, WorkDirectory* dir) +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) { + Addr pkt_addr = pkt->getAddr(); size_t pkt_size = pkt->getSize(); - uint64_t aligned_addr = roundDown(initAddr, pkt_size); - - if (pkt->getAddr() == aligned_addr) { - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { - dir->activate(aligned_addr); + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + 
WorkListItem new_wl = items[i]; + new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; + bool vertex_active = activeCondition(new_wl, items[i]); + if (vertex_active) { + new_wl.activeNow = true; } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); - } -} + items[i] = new_wl; + atom_active |= vertex_active; -uint32_t -SSSPWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); } uint32_t @@ -210,29 +215,6 @@ SSSPWorkload::propagate(uint32_t value, uint32_t weight) return value + weight; } -bool -SSSPWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -SSSPWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -SSSPWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); -} - - void BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -309,61 +291,4 @@ BSPPRWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? 
"true" : "false"); } -void -CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) -{ - Addr pkt_addr = pkt->getAddr(); - size_t pkt_size = pkt->getSize(); - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); - bool atom_active = false; - for (int i = 0; i < num_elements; i++) { - items[i].tempProp = (int) ( pkt_addr / sizeof(WorkListItem)) + i; - items[i].prop = -1; - atom_active |= activeCondition(items[i]); - } - if (atom_active) { - dir->activate(pkt->getAddr()); - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); -} - -uint32_t -CCWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); -} - -uint32_t -CCWorkload::propagate(uint32_t value, uint32_t weight) -{ - return value; -} - -bool -CCWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -CCWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -CCWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); -} - } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index fdd4928e10..fa722a634e 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -78,49 +78,31 @@ class BFSWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; -class BFSVisitedWorkload : public GraphWorkload +class BFSVisitedWorkload : public BFSWorkload { - private: - uint64_t initAddr; - uint32_t initValue; - public: - BFSVisitedWorkload(uint64_t init_addr, uint32_t init_value): - initAddr(init_addr), initValue(init_value) + BFSVisitedWorkload(Addr init_addr, uint32_t init_value): + 
BFSWorkload(init_addr, init_value) {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; - ~BFSVisitedWorkload() {} - +class CCWorkload : public BFSVisitedWorkload +{ + public: + CCWorkload(): BFSVisitedWorkload(0, 0) {} virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); }; -class SSSPWorkload : public GraphWorkload +class SSSPWorkload : public BFSWorkload { - private: - uint64_t initAddr; - uint32_t initValue; - public: - SSSPWorkload(uint64_t init_addr, uint32_t init_value): - initAddr(init_addr), initValue(init_value) + SSSPWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) {} - - ~SSSPWorkload() {} - - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; }; - class BSPPRWorkload : public GraphWorkload { private: @@ -140,21 +122,28 @@ class BSPPRWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; -class CCWorkload : public GraphWorkload -{ - - public: - CCWorkload() {} - - ~CCWorkload() {} - - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// class 
BSPBCWorkload : public GraphWorkload +// { +// private: +// int currentDepth; +// Addr initAddr; +// uint32_t initValue; + +// public: +// BSPBCWorkload(Addr init_addr, uint32_t init_value): +// currentDepth(1), initAddr(init_addr), initValue(init_value) +// {} + +// ~BSPBCWorkload() {} + +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual void interIterationInit(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index bda2fa3d6a..f3210a8ec3 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,9 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), PyBindMethod("createPRWorkload"), PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 26e4473b03..8414aee259 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -57,6 +57,24 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } +void +CenteralController::createBFSVisitedWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSVisitedWorkload(init_addr, init_value); +} + +void +CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new 
SSSPWorkload(init_addr, init_value); +} + +void +CenteralController::createCCWorkload() +{ + workload = new CCWorkload(); +} + void CenteralController::createPRWorkload(float alpha) { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ab039e5024..aa3938353d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -70,6 +70,9 @@ class CenteralController : public ClockedObject void createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); + void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value); + void createSSSPWorkload(Addr init_addr, uint32_t init_value); + void createCCWorkload(); void createPRWorkload(float alpha); void recvDoneSignal(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 09f29a43e4..a8c9a1bcb1 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -411,7 +411,6 @@ PushEngine::createUpdatePacket(Addr addr, T value) // bits req->setPC(((Addr) 1) << 2); - // FIXME: MemCmd::UpdateWL PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); From aee9d09f4fbf08f7a2c6f4a81957a82546a8f0bf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 11:15:59 -0800 Subject: [PATCH 237/247] Fixing port proxy bug of limiting size to int. 
--- src/accl/graph/base/graph_workload.cc | 8 ++------ src/accl/graph/sega/centeral_controller.cc | 12 +++++++----- src/accl/graph/sega/mpu.hh | 1 + src/mem/port_proxy.cc | 6 +++--- src/mem/port_proxy.hh | 18 +++++++++--------- src/mem/translating_port_proxy.cc | 6 +++--- src/mem/translating_port_proxy.hh | 6 +++--- 7 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 7471e4d073..38f11778b6 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -193,13 +193,9 @@ CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) for (int i = 0; i < num_elements; i++) { WorkListItem new_wl = items[i]; new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; - bool vertex_active = activeCondition(new_wl, items[i]); - if (vertex_active) { - new_wl.activeNow = true; - } + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; items[i] = new_wl; - atom_active |= vertex_active; - } if (atom_active) { dir->activate(pkt->getAddr()); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 8414aee259..970a0572c5 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -101,6 +101,7 @@ CenteralController::createPopCountDirectory(int atoms_per_block) void CenteralController::startup() { + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); mpu->setProcessingMode(mode); @@ -126,7 +127,7 @@ CenteralController::startup() mpu->recvFunctional(pkt); } } - }, system->cacheLineSize()); + }, vertex_atom); panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); @@ -190,18 +191,19 @@ CenteralController::workCount() void CenteralController::printAnswerToHostSimout() { - int num_items = system->cacheLineSize() / 
sizeof(WorkListItem); + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); + int num_items = vertex_atom / sizeof(WorkListItem); WorkListItem items[num_items]; - for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) + for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom) { - PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + PacketPtr pkt = createReadPacket(addr, vertex_atom); for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; if (contains(range_list, addr)) { mpu->recvFunctional(pkt); } } - pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + pkt->writeDataToBlock((uint8_t*) items, vertex_atom); for (int i = 0; i < num_items; i++) { std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 04393db36d..95d3adeca5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -67,6 +67,7 @@ class MPU : public SimObject void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for 
(ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; @@ -154,7 +154,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. 
*/ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; @@ -165,7 +165,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success. */ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) failed", addr); @@ -189,7 +189,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) 
failed", addr); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadblob that translates virt->phys and deals * with page boundries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. 
*/ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5 From eb22da3749dbb7f17e1464c912cb6314e6cb414b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 11:31:29 -0800 Subject: [PATCH 238/247] Fixing postConsumeProcess. --- src/accl/graph/sega/coalesce_engine.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 263e08d901..4fa400a63a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -115,7 +115,9 @@ void CoalesceEngine::postConsumeProcess() { WorkListItem items[numElementsPerLine]; - for (Addr addr = 0; addr <= lastAtomAddr; addr += peerMemoryAtomSize) { + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); int block_index = getBlockIndex(addr); if (cacheBlocks[block_index].addr == addr) { assert(cacheBlocks[block_index].valid); @@ -125,11 +127,6 @@ CoalesceEngine::postConsumeProcess() bool atom_active_future_after = false; for (int index = 0; index < numElementsPerLine; index++) { assert(!cacheBlocks[block_index].items[index].activeNow); - // if (cacheBlocks[block_index].items[index].activeFuture) { - // graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); - // cacheBlocks[block_index].items[index].activeNow = true; - // cacheBlocks[block_index].items[index].activeFuture = false; - // } atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; From 1acdbb465257bf3f57ab9b4ff2de31fc4bd8fde0 Mon Sep 17 
00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 18:18:28 -0800 Subject: [PATCH 239/247] Addding BC. --- src/accl/graph/base/graph_workload.cc | 157 +++++++++++++-------- src/accl/graph/base/graph_workload.hh | 52 ++++--- src/accl/graph/sega/centeral_controller.cc | 10 ++ 3 files changed, 140 insertions(+), 79 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 38f11778b6..6ac2018629 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -118,63 +118,6 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } -// void -// BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) -// { -// size_t pkt_size = pkt->getSize(); -// uint64_t aligned_addr = roundDown(initAddr, pkt_size); - -// if (pkt->getAddr() == aligned_addr) { -// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); -// WorkListItem items[num_elements]; - -// pkt->writeDataToBlock((uint8_t*) items, pkt_size); - -// int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); -// items[index].tempProp = initValue; -// if (activeCondition(items[index])) { -// dir->activate(aligned_addr); -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, pkt_size); -// } -// } - -// uint32_t -// BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) -// { -// return std::min(update, value); -// } - -// uint32_t -// BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) -// { -// return 1; -// } - -// bool -// BFSVisitedWorkload::activeCondition(WorkListItem wl) -// { -// return (wl.tempProp < wl.prop) && (wl.degree > 0); -// } - -// uint32_t -// BFSVisitedWorkload::apply(WorkListItem& wl) -// { -// wl.prop = wl.tempProp; -// return wl.prop; -// } - -// std::string -// BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) -// { -// return csprintf( -// "WorkListItem{tempProp: %u, prop: %u, degree: 
%u, edgeIndex: %u}", -// wl.tempProp, wl.prop, wl.degree, wl.edgeIndex -// ); -// } - uint32_t BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { return value; @@ -287,4 +230,104 @@ BSPPRWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } +void +BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int pkt_size = pkt->getSize(); + int aligned_addr = roundDown(initAddr, pkt_size); + + if (aligned_addr == pkt->getAddr()) { + int num_elements = pkt_size / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (initAddr - aligned_addr) / sizeof(WorkListItem); + WorkListItem new_wl = items[index]; + uint32_t prop = 0; + prop |= initValue; + // NOTE: Depth of the initial vertex is 0. + prop &= (4294967295U >> 8); + new_wl.tempProp = prop; + new_wl.prop = prop; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BSPBCWorkload::reduce(uint32_t update, uint32_t value) +{ + uint32_t update_depth = (update & depthMask) >> 24; + uint32_t update_count = (update & countMask); + assert(update_depth == (currentDepth - 1)); + uint32_t value_depth = (value & depthMask) >> 24; + uint32_t value_count = (value & countMask); + if (value_depth == 255) { + value_depth = update_depth; + value_count = 0; + } + if (value_depth == currentDepth) { + value_count += update_count; + } + uint32_t ret = 0; + ret |= value_count; + warn_if(value_count > 16777215, "value count has grown bigger than 16777125." + " This means the algorithm result might not be correct." + " However, the traversal will not be affected." + " Therefore, performane metrics could be used."); + // HACK: Make sure to always set the depth correctly even if count + // exceeds the 2^24-1 limit. 
Here we reset the depth section of ret. + ret &= (4294967295U >> 8); + // NOTE: Now that the depth is securely reset we can copy the correct value. + ret |= (value_depth << 24); + return ret; +} + +uint32_t +BSPBCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +uint32_t +BSPBCWorkload::apply(WorkListItem& wl) +{ + return wl.prop; +} + +void +BSPBCWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; +} + +bool +BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + uint32_t depth = (new_wl.tempProp & depthMask) >> 24; + return (depth == currentDepth); +} + +std::string +BSPBCWorkload::printWorkListItem(WorkListItem wl) +{ + uint32_t temp_depth = (wl.tempProp & depthMask) >> 24; + uint32_t temp_count = (wl.tempProp & countMask); + uint32_t depth = (wl.prop & depthMask) >> 24; + uint32_t count = (wl.prop & countMask); + return csprintf( + "WorkListItem{tempProp: (depth: %d, count: %d), " + "prop: (depth: %d, count: %d), degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index fa722a634e..4ed3dcf3ac 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual void iterate() = 0; virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; @@ -73,6 +74,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); @@ -117,33 +119,39 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; -// class BSPBCWorkload : public GraphWorkload -// { -// private: -// int currentDepth; -// Addr initAddr; -// uint32_t initValue; - -// public: -// BSPBCWorkload(Addr init_addr, uint32_t init_value): -// currentDepth(1), initAddr(init_addr), initValue(init_value) -// {} - -// ~BSPBCWorkload() {} - -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// 
virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual void interIterationInit(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): + currentDepth(0), initAddr(init_addr), initValue(init_value), + depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; } diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 970a0572c5..15062f1465 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -131,6 +131,11 @@ CenteralController::startup() panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); + // IDEA: Should this be here or after calling start? + // Point of iterate here is to set global variables. + // At this point, we know that vertex memory has been + // initialized and we can initialize global variables. 
+ workload->iterate(); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -170,6 +175,11 @@ CenteralController::recvDoneSignal() for (auto mpu: mpuVector) { mpu->postConsumeProcess(); mpu->swapDirectories(); + // IDEA: Should this be here or after calling start? + // Point of iterate here is to update global variables. + // At this point, we know that vertex memory has been + // updated and we can update global variables. + workload->iterate(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } From c6af36c8432cd6057cc4b3bbc0a88c007ef557f5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 20:58:16 -0800 Subject: [PATCH 240/247] Adding BC and degbugging. --- configs/accl/{sega-ddr/pr.py => bc.py} | 18 +- configs/accl/bfs.py | 20 +- configs/accl/{sega-ddr => }/cc.py | 0 configs/accl/sega-ddr/bfs.py | 138 -------------- configs/accl/sega-ddr/sega.py | 209 --------------------- configs/accl/sega.py | 98 +++++++--- configs/accl/sega_simple.py | 96 +++++++--- configs/accl/{sega-ddr => }/sssp.py | 0 src/accl/graph/base/graph_workload.cc | 9 +- src/accl/graph/base/graph_workload.hh | 4 +- src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 18 +- src/accl/graph/sega/centeral_controller.hh | 1 + 13 files changed, 195 insertions(+), 417 deletions(-) rename configs/accl/{sega-ddr/pr.py => bc.py} (90%) rename configs/accl/{sega-ddr => }/cc.py (100%) delete mode 100644 configs/accl/sega-ddr/bfs.py delete mode 100644 configs/accl/sega-ddr/sega.py rename configs/accl/{sega-ddr => }/sssp.py (100%) diff --git a/configs/accl/sega-ddr/pr.py b/configs/accl/bc.py similarity index 90% rename from configs/accl/sega-ddr/pr.py rename to configs/accl/bc.py index ea8a103640..074bee73b9 100644 --- a/configs/accl/sega-ddr/pr.py +++ b/configs/accl/bc.py @@ -34,10 +34,12 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", 
type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) - argparser.add_argument("alpha", type=float) + argparser.add_argument("iterations", type=int) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) argparser.add_argument( "--simple", dest="simple", @@ -67,10 +69,12 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.iterations, - args.alpha, + args.init_addr, + args.init_value, args.simple, args.sample, args.verify, @@ -80,10 +84,12 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, iterations, - alpha, + init_addr, + init_value, simple, sample, verify, @@ -93,14 +99,14 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha) + system.create_bc_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index ab5de485b1..97f1b5dc21 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -34,10 +34,19 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) argparser.add_argument( 
"--simple", dest="simple", @@ -67,10 +76,12 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.init_addr, args.init_value, + args.visited, args.simple, args.sample, args.verify, @@ -80,10 +91,12 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, init_addr, init_value, + visited, simple, sample, verify, @@ -93,14 +106,17 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() system.set_async_mode() system.create_pop_count_directory(64) - system.create_bfs_workload(init_addr, init_value) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/sega-ddr/cc.py b/configs/accl/cc.py similarity index 100% rename from configs/accl/sega-ddr/cc.py rename to configs/accl/cc.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py deleted file mode 100644 index 97f1b5dc21..0000000000 --- a/configs/accl/sega-ddr/bfs.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -import m5 -import argparse - -from m5.objects import * - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - argparser.add_argument( - "--visited", - dest="visited", - action="store_const", - const=True, - default=False, - help="Use visitation version of BFS", - ) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) - argparser.add_argument( - "--sample", - dest="sample", - action="store_const", - const=True, - default=False, - help="Sample sim stats every 100us", - ) - argparser.add_argument( - "--verify", - dest="verify", - action="store_const", - const=True, - default=False, - help="Print final answer", - ) - - args = argparser.parse_args() - - return ( - args.num_gpts, - args.num_registers, - args.cache_size, - args.graph, - args.init_addr, - args.init_value, - args.visited, - args.simple, - args.sample, - args.verify, - ) - - -if __name__ == "__m5_main__": - ( - num_gpts, - num_registers, - cache_size, - graph, - init_addr, - init_value, - visited, - simple, - sample, - verify, - ) = get_inputs() - - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) - root = Root(full_system=False, system=system) - - m5.instantiate() - - system.set_async_mode() - system.create_pop_count_directory(64) - if visited: - system.create_bfs_visited_workload(init_addr, init_value) - else: - system.create_bfs_workload(init_addr, init_value) - if sample: - while True: - exit_event = m5.simulate(100000000) - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - 
m5.stats.dump() - m5.stats.reset() - if exit_event.getCause() != "simulate() limit reached": - break - else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - if verify: - system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py deleted file mode 100644 index 8325cf7565..0000000000 --- a/configs/accl/sega-ddr/sega.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from math import log -from m5.objects import * - - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append( - AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i, - ) - ) - return ret, intlv_low_bit + intlv_bits - 1 - - -class GPT(SubSystem): - def __init__(self, register_file_size: int, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - max_resp_per_cycle=8, - pending_pull_limit=64, - active_buffer_size=80, - post_push_wb_queue_size=64, - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=4096, - max_propagates_per_cycle=8, - update_queue_size=32, - ) - - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - dram_2=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - ) - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - 
push_engine=self.push_engine, - ) - - def getRespPort(self): - return self.wl_engine.in_ports - - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - - def setReqPort(self, port): - self.push_engine.out_ports = port - - def getEdgeMemPort(self): - return self.push_engine.mem_port - - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port - - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit - - -class EdgeMemory(SubSystem): - def __init__(self, size: str): - super(EdgeMemory, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = "2.4GHz" - self.clk_domain.voltage_domain = VoltageDomain() - - self.mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) - ) - self.xbar = NoncoherentXBar( - width=64, frontend_latency=1, forward_latency=1, response_latency=1 - ) - self.xbar.mem_side_ports = self.mem_ctrl.port - - def set_image(self, image): - self.mem_ctrl.dram.image_file = image - - def getPort(self): - return self.xbar.cpu_side_ports - - def setPort(self, port): - self.xbar.cpu_side_ports = port - -class SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): - super(SEGA, self).__init__() - # num_gpts should be an even power of 2 - assert num_gpts != 0 - assert num_gpts % 2 == 0 - assert (num_gpts & (num_gpts - 1)) == 0 - - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = "2GHz" - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - # Building the CenteralController - self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") - # Building the EdgeMemories - edge_mem = [] - for i in range(int(num_gpts/2)): - mem = EdgeMemory("16GiB") - 
mem.set_image(f"{graph_path}/edgelist_{i}") - edge_mem.append(mem) - self.edge_mem = edge_mem - # Building the GPTs - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 - ) - gpts = [] - for i in range(num_gpts): - gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_gpts]] - ) - gpt.set_vertex_pch_bit(pch_bit) - gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def work_count(self): - return self.ctrl.workCount() - - def set_async_mode(self): - self.ctrl.setAsyncMode() - - def set_bsp_mode(self): - self.ctrl.setBSPMode() - - def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) - - def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) - - def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) - - def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) - - def create_cc_workload(self): - self.ctrl.createCCWorkload() - - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) - - def print_answer(self): - self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b5ce618f7f..32124731d6 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,16 +47,17 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__( - self, edge_memory_size: str, cache_size: str): + def __init__(self, register_file_size: int, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, 
register_file_size=64) + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -68,16 +69,14 @@ def __init__( ) self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), - dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) - ) - - self.edge_mem_ctrl = MemCtrl( - dram= - DDR4_2400_8x8(range=AddrRange(edge_memory_size), in_addr_map=False) + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( wl_engine=self.wl_engine, @@ -97,6 +96,12 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.out_ports = port + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + def set_vertex_range(self, vertex_ranges): self.vertex_mem_ctrl.dram.range = vertex_ranges[0] self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] @@ -104,32 +109,65 @@ def set_vertex_range(self, vertex_ranges): def set_vertex_pch_bit(self, pch_bit): self.vertex_mem_ctrl.pch_bit = pch_bit - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + 
dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("16GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_mpus, 32 + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 ) - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_mpus]] + [vertex_ranges[i], vertex_ranges[i + num_gpts]] ) gpt.set_vertex_pch_bit(pch_bit) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: @@ -154,8 +192,20 @@ def 
create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ff97134b47..ff567b57e3 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -47,16 +47,17 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__( - self, edge_memory_size: str, cache_size: str): + def __init__(self, register_file_size: int, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -67,14 +68,10 @@ def __init__( update_queue_size=32, ) - self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") - - self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False) + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port 
- self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( wl_engine=self.wl_engine, @@ -94,32 +91,77 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.out_ports = port + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - + # Building the CenteralController + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("16GiB") + 
mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), num_mpus, 32 + AddrRange(start=0, size="4GiB"), num_gpts, 32 ) - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) gpt.set_vertex_range(vertex_ranges[i]) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: @@ -144,8 +186,20 @@ def create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega-ddr/sssp.py b/configs/accl/sssp.py similarity index 100% rename from configs/accl/sega-ddr/sssp.py rename to configs/accl/sssp.py diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 6ac2018629..7bcd447b8e 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -245,7 +245,7 @@ BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) uint32_t prop = 0; prop |= initValue; // NOTE: Depth of the initial vertex is 0. 
- prop &= (4294967295U >> 8); + prop &= countMask; new_wl.tempProp = prop; new_wl.prop = prop; if (activeCondition(new_wl, items[index])) { @@ -265,11 +265,10 @@ BSPBCWorkload::reduce(uint32_t update, uint32_t value) { uint32_t update_depth = (update & depthMask) >> 24; uint32_t update_count = (update & countMask); - assert(update_depth == (currentDepth - 1)); uint32_t value_depth = (value & depthMask) >> 24; uint32_t value_count = (value & countMask); if (value_depth == 255) { - value_depth = update_depth; + value_depth = currentDepth; value_count = 0; } if (value_depth == currentDepth) { @@ -283,7 +282,7 @@ BSPBCWorkload::reduce(uint32_t update, uint32_t value) " Therefore, performane metrics could be used."); // HACK: Make sure to always set the depth correctly even if count // exceeds the 2^24-1 limit. Here we reset the depth section of ret. - ret &= (4294967295U >> 8); + ret &= countMask; // NOTE: Now that the depth is securely reset we can copy the correct value. ret |= (value_depth << 24); return ret; @@ -311,7 +310,7 @@ bool BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { uint32_t depth = (new_wl.tempProp & depthMask) >> 24; - return (depth == currentDepth); + return (depth == currentDepth) && (new_wl.degree > 0); } std::string diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 4ed3dcf3ac..5a55ad4cdc 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -137,8 +137,8 @@ class BSPBCWorkload : public GraphWorkload uint32_t countMask; public: BSPBCWorkload(Addr init_addr, uint32_t init_value): - currentDepth(0), initAddr(init_addr), initValue(init_value), - depthMask(4278190080), countMask(16777215) + initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) {} ~BSPBCWorkload() {} diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 
f3210a8ec3..7e16b7e7de 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -50,6 +50,7 @@ class CenteralController(ClockedObject): PyBindMethod("createSSSPWorkload"), PyBindMethod("createCCWorkload"), PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 15062f1465..86b9ea2b02 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -81,6 +81,12 @@ CenteralController::createPRWorkload(float alpha) workload = new BSPPRWorkload(alpha); } +void +CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BSPBCWorkload(init_addr, init_value); +} + void CenteralController::createPopCountDirectory(int atoms_per_block) { @@ -131,17 +137,13 @@ CenteralController::startup() panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); - // IDEA: Should this be here or after calling start? - // Point of iterate here is to set global variables. - // At this point, we know that vertex memory has been - // initialized and we can initialize global variables. - workload->iterate(); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } + workload->iterate(); } PacketPtr @@ -175,15 +177,11 @@ CenteralController::recvDoneSignal() for (auto mpu: mpuVector) { mpu->postConsumeProcess(); mpu->swapDirectories(); - // IDEA: Should this be here or after calling start? - // Point of iterate here is to update global variables. - // At this point, we know that vertex memory has been - // updated and we can update global variables. 
- workload->iterate(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } + workload->iterate(); exitSimLoopNow("finished an iteration."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index aa3938353d..ba829061b5 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,6 +74,7 @@ class CenteralController : public ClockedObject void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); void createPRWorkload(float alpha); + void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); From 787f7f4f45ffeb9e312f4a9000f58742552b555d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 21:03:16 -0800 Subject: [PATCH 241/247] Fixing BC run script. --- configs/accl/bc.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/configs/accl/bc.py b/configs/accl/bc.py index 074bee73b9..56faeb3e4d 100644 --- a/configs/accl/bc.py +++ b/configs/accl/bc.py @@ -37,7 +37,6 @@ def get_inputs(): argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) argparser.add_argument( @@ -72,7 +71,6 @@ def get_inputs(): args.num_registers, args.cache_size, args.graph, - args.iterations, args.init_addr, args.init_value, args.simple, @@ -87,7 +85,6 @@ def get_inputs(): num_registers, cache_size, graph, - iterations, init_addr, init_value, simple, @@ -119,16 +116,16 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - iteration = 0 - while iteration < iterations: + iterations = 0 + while True: exit_event = m5.simulate() print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" ) - iteration += 1 
+ iterations += 1 if system.work_count() == 0: break - print(f"#iterations: {iteration}") + print(f"#iterations: {iterations}") if verify: system.print_answer() From b13d005fcb65f7d9e6d97ecc6285044055efa7d7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 16 Nov 2022 22:54:39 -0800 Subject: [PATCH 242/247] Fixing dirty issue in bsp. --- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 32124731d6..672151ceed 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -152,7 +152,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts/2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ff567b57e3..06908d08d3 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -147,7 +147,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4fa400a63a..a2d4378377 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -114,7 +114,6 @@ CoalesceEngine::postMemInitSetup() void CoalesceEngine::postConsumeProcess() { - WorkListItem items[numElementsPerLine]; Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { Addr addr = 
peerMemoryRange.addIntlvBits(local_addr); @@ -133,6 +132,7 @@ CoalesceEngine::postConsumeProcess() if (cacheBlocks[block_index].items[index].activeFuture) { cacheBlocks[block_index].items[index].activeFuture = false; cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; } } if (!atom_active_future_before && atom_active_future_after) { @@ -142,10 +142,10 @@ CoalesceEngine::postConsumeProcess() futureActiveCacheBlocks.erase(block_index); } } else { + WorkListItem items[numElementsPerLine]; PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); memPort.sendFunctional(read_pkt); read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - delete read_pkt; bool atom_active_future_before = false; bool atom_active_future_after = false; for (int index = 0; index < numElementsPerLine; index++) { @@ -166,6 +166,7 @@ CoalesceEngine::postConsumeProcess() } PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); memPort.sendFunctional(write_pkt); + delete read_pkt; delete write_pkt; } } From 7861b6a29700aaaf606a6f4b5a47611aea086c87 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Nov 2022 19:26:29 -0800 Subject: [PATCH 243/247] Adding Async PR. 
--- configs/accl/async-pr.py | 125 +++++++++++++++++++++ configs/accl/pr.py | 6 +- configs/accl/sega.py | 6 + configs/accl/sega_simple.py | 3 + src/accl/graph/base/graph_workload.cc | 78 +++++++++++++ src/accl/graph/base/graph_workload.hh | 30 ++++- src/accl/graph/sega/CenteralController.py | 2 + src/accl/graph/sega/centeral_controller.cc | 13 +++ src/accl/graph/sega/centeral_controller.hh | 2 + src/accl/graph/sega/coalesce_engine.cc | 23 +++- src/accl/graph/sega/coalesce_engine.hh | 3 + src/accl/graph/sega/wl_engine.cc | 9 ++ src/accl/graph/sega/wl_engine.hh | 3 + 13 files changed, 294 insertions(+), 9 deletions(-) create mode 100644 configs/accl/async-pr.py diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py new file mode 100644 index 0000000000..0bfb6caeaa --- /dev/null +++ b/configs/accl/async-pr.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
# (BSD-3 license header precedes this excerpt in the original file.)

import argparse

import m5
from m5.objects import *


def get_inputs():
    """Parse command-line arguments for an asynchronous PageRank run.

    Positional: num_gpts, num_registers, cache_size, graph, alpha,
    threshold. Flags: --simple (SimpleMemory vertex memory), --sample
    (periodic stat dumps), --verify (print the final answer).

    Returns the parsed values as a flat tuple in declaration order.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("num_gpts", type=int)
    argparser.add_argument("num_registers", type=int)
    argparser.add_argument("cache_size", type=str)
    argparser.add_argument("graph", type=str)
    argparser.add_argument("alpha", type=float)
    argparser.add_argument("threshold", type=float)
    # NOTE: action="store_true" is the idiomatic, behavior-identical
    # spelling of action="store_const", const=True, default=False.
    argparser.add_argument(
        "--simple",
        dest="simple",
        action="store_true",
        help="Use simple memory for vertex",
    )
    argparser.add_argument(
        "--sample",
        dest="sample",
        action="store_true",
        help="Sample sim stats every 100us",
    )
    argparser.add_argument(
        "--verify",
        dest="verify",
        action="store_true",
        help="Print final answer",
    )

    args = argparser.parse_args()

    return (
        args.num_gpts,
        args.num_registers,
        args.cache_size,
        args.graph,
        args.alpha,
        args.threshold,
        args.simple,
        args.sample,
        args.verify,
    )


if __name__ == "__m5_main__":
    (
        num_gpts,
        num_registers,
        cache_size,
        graph,
        alpha,
        threshold,
        simple,
        sample,
        verify,
    ) = get_inputs()

    # sega_simple builds the system with SimpleMemory for the vertex
    # memory; sega uses the detailed memory model.
    if simple:
        from sega_simple import SEGA
    else:
        from sega import SEGA
    system = SEGA(num_gpts, num_registers, cache_size, graph)
    root = Root(full_system=False, system=system)

    m5.instantiate()

    system.set_async_mode()
    system.create_pop_count_directory(64)
    system.create_async_pr_workload(alpha, threshold)
    if sample:
        # Dump and reset stats every 100us of simulated time until the
        # workload signals completion (any cause other than the limit).
        while True:
            exit_event = m5.simulate(100000000)
            print(
                f"Exited simulation at tick {m5.curTick()} "
                f"because {exit_event.getCause()}"
            )
            m5.stats.dump()
            m5.stats.reset()
            if exit_event.getCause() != "simulate() limit reached":
                break
    else:
        exit_event = m5.simulate()
        print(
            f"Exited simulation at tick {m5.curTick()} "
            f"because {exit_event.getCause()}"
        )
    if verify:
        system.print_answer()
create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def get_pr_error(self): + return self.ctrl.getPRError() + def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 06908d08d3..d6ae8772a5 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -195,6 +195,9 @@ def create_sssp_workload(self, init_addr, init_value): def create_cc_workload(self): self.ctrl.createCCWorkload() + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 7bcd447b8e..3a401f0963 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -154,6 +154,81 @@ SSSPWorkload::propagate(uint32_t value, uint32_t weight) return value + weight; } +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int num_elements = pkt->getSize() / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt->getSize()); + + bool atom_active = false; + for (int index = 0; index < num_elements; index++) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = readFromFloat(0); + new_wl.prop = readFromFloat(1 - alpha); + atom_active |= activeCondition(new_wl, items[index]); + items[index] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt->getSize()); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return 
readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { + weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + float temp_float = writeToFloat(new_wl.tempProp); + float prop_float = writeToFloat(new_wl.prop); + float dist = std::abs(temp_float - prop_float); + return (dist >= threshold) && (new_wl.degree > 0); +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + wl.prop = wl.tempProp; + return readFromFloat(delta); +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + void BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -212,6 +287,9 @@ BSPPRWorkload::apply(WorkListItem& wl) void BSPPRWorkload::interIterationInit(WorkListItem& wl) { + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + error += std::abs(temp_float - prop_float); wl.prop = wl.tempProp; wl.tempProp = readFromFloat(1 - alpha); wl.activeFuture = (wl.degree > 0); diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 5a55ad4cdc..d42bfd0f26 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -105,13 +105,37 @@ class SSSPWorkload : public BFSWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight) override; }; +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {}; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + class BSPPRWorkload : public GraphWorkload { private: float alpha; + float error; public: - BSPPRWorkload(float alpha): alpha(alpha) {} + BSPPRWorkload(float alpha): alpha(alpha), error(0) {} ~BSPPRWorkload() {} @@ -119,10 +143,12 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual void iterate() {} + virtual void iterate() { error = 0; } virtual void 
interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return error; } }; class BSPBCWorkload : public GraphWorkload diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 7e16b7e7de..c5f44c82e9 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -49,8 +49,10 @@ class CenteralController(ClockedObject): PyBindMethod("createBFSVisitedWorkload"), PyBindMethod("createSSSPWorkload"), PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), PyBindMethod("createPRWorkload"), PyBindMethod("createBCWorkload"), PyBindMethod("workCount"), + PyBindMethod("getPRError"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 86b9ea2b02..23eb6bbc0e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -75,6 +75,12 @@ CenteralController::createCCWorkload() workload = new CCWorkload(); } +void +CenteralController::createAsyncPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} + void CenteralController::createPRWorkload(float alpha) { @@ -196,6 +202,13 @@ CenteralController::workCount() return work_count; } +float +CenteralController::getPRError() +{ + BSPPRWorkload* pr_workload = dynamic_cast(workload); + return pr_workload->getError(); +} + void CenteralController::printAnswerToHostSimout() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ba829061b5..e73ed22666 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -73,12 +73,14 @@ class CenteralController : public ClockedObject void createBFSVisitedWorkload(Addr init_addr, 
uint32_t init_value); void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); + void createAsyncPRWorkload(float alpha, float threshold); void createPRWorkload(float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); int workCount(); + float getPRError(); void printAnswerToHostSimout(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a2d4378377..02c98ba640 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -59,6 +59,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): nextApplyEvent([this] { processNextApplyEvent(); }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -552,8 +555,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } } - if (done()) { - owner->recvDoneSignal(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } return true; } @@ -712,8 +715,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) block_index, cacheBlocks[block_index].to_string()); stats.numVertexWrites++; - if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { - owner->recvDoneSignal(); + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } } @@ -749,8 +753,8 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } - if (done()) { - owner->recvDoneSignal(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } } @@ -1170,6 +1174,13 @@ CoalesceEngine::processNextApplyEvent() } } +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} 
CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8ee17781fc..b6eec725f9 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -151,6 +151,9 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ed91622b43..d563450179 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -43,6 +43,7 @@ WLEngine::WLEngine(const WLEngineParams& params): registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), stats(*this) { for (int i = 0; i < params.port_in_ports_connection_count; ++i) { @@ -316,6 +317,14 @@ WLEngine::processNextReduceEvent() } workListFile.clear(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +WLEngine::processNextDoneSignalEvent() +{ if (done()) { owner->recvDoneSignal(); } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 45baaa1e79..fb147e692a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -90,6 +90,9 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + struct WorkListStats : public statistics::Group { WorkListStats(WLEngine &worklist); From 
a991328c22c7dfa6b1b1e03d6d18868c651c3c0e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Nov 2022 20:33:07 -0800 Subject: [PATCH 244/247] Fixing typos. --- configs/accl/pr.py | 14 ++++++++++++-- configs/accl/sega.py | 4 ++-- configs/accl/sega_simple.py | 4 ++-- src/accl/graph/base/graph_workload.cc | 6 +++--- src/accl/graph/base/graph_workload.hh | 10 +++++++--- src/accl/graph/sega/centeral_controller.cc | 4 ++-- src/accl/graph/sega/centeral_controller.hh | 2 +- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 42ae46ea78..569514eb82 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -36,9 +36,11 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) + argparser.add_argument("iterations", type=int) argparser.add_argument("alpha", type=float) + argparser.add_argument("--num_nodes", type=int, default=1) + argparser.add_argument("--error_threshold", type=float, default=0.0) argparser.add_argument( "--simple", dest="simple", @@ -73,6 +75,8 @@ def get_inputs(): args.graph, args.iterations, args.alpha, + args.num_nodes, + args.error_threshold, args.simple, args.sample, args.verify, @@ -87,11 +91,15 @@ def get_inputs(): graph, iterations, alpha, + num_nodes, + error_threshold, simple, sample, verify, ) = get_inputs() + print(f"error_threshold: {error_threshold}") + if simple: from sega_simple import SEGA else: @@ -103,7 +111,7 @@ def get_inputs(): system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha) + system.create_pr_workload(num_nodes, alpha) if sample: while True: exit_event = m5.simulate(100000000) @@ -125,6 +133,8 @@ def get_inputs(): ) iteration += 1 print(f"error: {system.get_pr_error()}") + if system.get_pr_error() < error_threshold: + break if 
system.work_count() == 0: break print(f"#iterations: {iteration}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ef23575b9b..32d0dd26ab 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -204,8 +204,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def get_pr_error(self): return self.ctrl.getPRError() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index d6ae8772a5..2d36ec584d 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -198,8 +198,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 3a401f0963..ab58b02b73 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -240,8 +240,8 @@ BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) bool atom_active = false; for (int i = 0; i < num_elements; i++) { WorkListItem new_wl = items[i]; - new_wl.tempProp = readFromFloat(1 - alpha); - new_wl.prop = readFromFloat(1); + new_wl.tempProp = readFromFloat((1 - alpha)/numNodes); + new_wl.prop = readFromFloat(1/numNodes); new_wl.activeNow = activeCondition(new_wl, items[i]); atom_active |= new_wl.activeNow; items[i] = new_wl; @@ -291,7 +291,7 @@ BSPPRWorkload::interIterationInit(WorkListItem& wl) float prop_float = writeToFloat(wl.prop); error += 
std::abs(temp_float - prop_float); wl.prop = wl.tempProp; - wl.tempProp = readFromFloat(1 - alpha); + wl.tempProp = readFromFloat((1 - alpha) / numNodes); wl.activeFuture = (wl.degree > 0); } diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index d42bfd0f26..72748502c1 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -131,11 +131,15 @@ class PRWorkload : public GraphWorkload class BSPPRWorkload : public GraphWorkload { private: + int numNodes; float alpha; + float prevError; float error; public: - BSPPRWorkload(float alpha): alpha(alpha), error(0) {} + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} ~BSPPRWorkload() {} @@ -143,12 +147,12 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual void iterate() { error = 0; } + virtual void iterate() { prevError = error; error = 0; } virtual void interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); - float getError() { return error; } + float getError() { return prevError; } }; class BSPBCWorkload : public GraphWorkload diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 23eb6bbc0e..0aee3b77ce 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,9 +82,9 @@ CenteralController::createAsyncPRWorkload(float alpha, float threshold) } void -CenteralController::createPRWorkload(float alpha) +CenteralController::createPRWorkload(int num_nodes, float alpha) { - workload = new BSPPRWorkload(alpha); + workload = new BSPPRWorkload(num_nodes, alpha); } void diff --git 
a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index e73ed22666..cce9ac2725 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,7 +74,7 @@ class CenteralController : public ClockedObject void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); void createAsyncPRWorkload(float alpha, float threshold); - void createPRWorkload(float alpha); + void createPRWorkload(int num_nodes, float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); From da4decf6a2960a7489f1d8450069a9314dae21b0 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 7 Feb 2023 14:03:15 -0800 Subject: [PATCH 245/247] Fixing init in asyncPR. --- src/accl/graph/base/graph_workload.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index ab58b02b73..fd802cf275 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -166,7 +166,8 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem new_wl = items[index]; new_wl.tempProp = readFromFloat(0); new_wl.prop = readFromFloat(1 - alpha); - atom_active |= activeCondition(new_wl, items[index]); + new_wl.activeNow = activeCondition(new_wl, items[index]); + atom_active |= new_wl.activeNow; items[index] = new_wl; } if (atom_active) { From 7256874c4596608c6721768b3f06a1bd21f16879 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 9 Mar 2023 11:27:37 -0800 Subject: [PATCH 246/247] Improving UniqueFIFO implementation. 
/**
 * A FIFO that holds at most one live copy of each item, with O(1)
 * membership tests.
 *
 * Items must be integers in [0, size) because they index the per-item
 * presence counters. erase() is lazy: the item is only marked deleted
 * and physically dropped from the head later by fix_front().
 *
 * Invariant: added[i] - deleted[i] is 1 when i is live, 0 otherwise.
 *
 * BUGFIX: the counters are now std::deque<int> instead of raw new[]'d
 * arrays. The previous version never freed them (no destructor) and its
 * operator= copied the raw pointers, so two FIFOs aliased the same
 * counter storage and clear()/pop on one corrupted the other. With
 * value-semantic containers, the implicit copy/move/destructor are all
 * correct (rule of zero).
 */
template<typename T>
class UniqueFIFO
{
  private:
    int cap;  // capacity: number of distinct representable items
    int pop;  // population: live (pushed, not yet popped/erased) items

    std::deque<int> added;    // per-item push counter (0 or 1 live)
    std::deque<int> deleted;  // per-item pending lazy-delete counter
    std::deque<T> container;  // FIFO order; may hold lazily-erased items

  public:
    UniqueFIFO(): cap(0), pop(0) {}

    UniqueFIFO(int size):
        cap(size), pop(0), added(size, 0), deleted(size, 0)
    {}

    // Drop lazily-erased items off the head until a live item surfaces.
    // Precondition: at least one live item exists (size() > 0).
    void fix_front() {
        assert(!container.empty());
        while (true) {
            T elem = container.front();
            if (deleted[elem] > 0) {
                deleted[elem]--;
                added[elem]--;
                container.pop_front();
            } else {
                assert(deleted[elem] == 0);
                assert(added[elem] == 1);
                break;
            }
        }
    }

    // Oldest live item.
    T front() {
        fix_front();
        return container.front();
    }

    size_t size() {
        return pop;
    }

    void clear() {
        pop = 0;
        added.assign(cap, 0);
        deleted.assign(cap, 0);
        container.clear();
    }

    bool empty() {
        return size() == 0;
    }

    // O(1) membership test via the counter invariant.
    bool find(T item) {
        assert(added[item] >= 0);
        assert(deleted[item] >= 0);
        int diff = added[item] - deleted[item];
        assert((diff == 0) || (diff == 1));
        return (diff == 1);
    }

    // Push only if the item is not already live (uniqueness).
    void push_back(T item) {
        if (!find(item)) {
            added[item]++;
            pop++;
            container.push_back(item);
        }
    }

    // Remove the oldest live item; fix_front() guarantees the head is
    // live before we pop it.
    void pop_front() {
        T elem = front();
        assert(added[elem] == 1);
        added[elem] = 0;
        pop--;
        container.pop_front();
    }

    // Lazily delete a live item anywhere in the FIFO; the physical slot
    // is reclaimed when it reaches the head.
    void erase(T item) {
        assert(find(item));
        deleted[item]++;
        pop--;
    }

    // Rule of zero: the implicitly-defined copy/move/assign/destructor
    // deep-copy or release the deques correctly.
};
pullsScheduled--; if (!currentDirectory->empty()) { Addr addr = currentDirectory->getNextWork(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a8c9a1bcb1..981b581b7c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -273,7 +273,9 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + // uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + // TODO: Change above line to below line. + uint8_t pkt_data [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); @@ -291,7 +293,7 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); - delete pkt_data; + // delete [] pkt_data; delete pkt; if (!nextPropagateEvent.scheduled()) { diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 18430aee0d..620e97f654 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -100,6 +100,7 @@ class PopCountDirectory: public WorkDirectory for (int index = 0; index < numCounters; index++) { popCount[index] = 0; } + activeBlockIndices = UniqueFIFO(numCounters); } // CAUTION: This should only be called when the work From 8673a9d8449bba7cf2dc4734a651fbd10852acd8 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 9 Mar 2023 14:06:05 -0800 Subject: [PATCH 247/247] Adding asynchronous temporal partitioning --- src/accl/graph/sega/MPU.py | 5 +- src/accl/graph/sega/centeral_controller.cc | 31 + src/accl/graph/sega/centeral_controller.hh | 5 + src/accl/graph/sega/coalesce_engine.hh | 4 + src/accl/graph/sega/coalesce_engine_s.cc | 1223 ++++++++++++++++++++ src/accl/graph/sega/mpu.cc | 52 +- src/accl/graph/sega/mpu.hh | 16 +- 
src/accl/graph/sega/wl_engine.cc | 14 + 8 files changed, 1343 insertions(+), 7 deletions(-) create mode 100644 src/accl/graph/sega/coalesce_engine_s.cc diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 8d2453b01c..79fa7db8d0 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,9 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.SimObject import SimObject +# from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject -class MPU(SimObject): +class MPU(ClockedObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0aee3b77ce..fc4bacd414 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -93,6 +93,18 @@ CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value) workload = new BSPBCWorkload(init_addr, init_value); } +bool +CenteralController::bufferRemoteUpdate(int slice_number, PacketPtr pkt) +{ + for (auto mpu: mpuVector) { + if (contains(mpu->getAddrRanges(), pkt->getAddr())) { + remoteUpdates[mpu][slice_number].push_back(pkt); + } + } + + return true; +} + void CenteralController::createPopCountDirectory(int atoms_per_block) { @@ -173,6 +185,25 @@ CenteralController::recvDoneSignal() bool done = true; for (auto mpu : mpuVector) { done &= mpu->done(); + int total_num_slices = remoteUpdates[mpu].size(); + if (mpu->done()) { + int slice_number = mpu->getSliceCounter() + 1; + while ((total_num_slices != 0) && (slice_number != mpu->getSliceCounter())) { + if (!remoteUpdates[mpu][slice_number].empty()) { + mpu->scheduleNewSlice(); + mpu->updateSliceCounter(slice_number); + done = false; + break; + } + else { + if (slice_number == total_num_slices) { + slice_number = 0; + } else { + slice_number++; + } + } + } + } } if (done && mode == ProcessingMode::ASYNCHRONOUS) { diff 
--git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index cce9ac2725..6692d999ed 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -77,11 +77,16 @@ class CenteralController : public ClockedObject void createPRWorkload(int num_nodes, float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); + bool bufferRemoteUpdate(int slice_number, PacketPtr pkt); + int getnumGPTs() {return mpuVector.size();} + void recvDoneSignal(); int workCount(); float getPRError(); void printAnswerToHostSimout(); + std::unordered_map>> + remoteUpdates; }; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b6eec725f9..10a71a7ef1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -217,6 +217,10 @@ class CoalesceEngine : public BaseMemoryEngine ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); + int getSliceSize() + {return (int)(params().cache_size); } + // /sizeof(WorkListItem)); } + int workCount(); int futureWorkCount(); void recvVertexPull(); diff --git a/src/accl/graph/sega/coalesce_engine_s.cc b/src/accl/graph/sega/coalesce_engine_s.cc new file mode 100644 index 0000000000..6a5261d38c --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_s.cc @@ -0,0 +1,1223 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), + pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + activeBuffer.clear(); + postPushWBQueue.clear(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + + +// NOTE: Used for initializing memory and reading the final answer +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].state == CacheState::IDLE); + + 
pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + graphWorkload->init(pkt, currentDirectory); + if (pkt->getAddr() > lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } + memPort.sendFunctional(pkt); + } +} + +void +CoalesceEngine::postMemInitSetup() +{ + currentDirectory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::postConsumeProcess() +{ + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + WorkListItem items[numElementsPerLine]; + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + 
memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete read_pkt; + delete write_pkt; + } + } +} + +void +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void +CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); +} + +bool +CoalesceEngine::done() +{ + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && 
(onTheFlyReqs == 0); +} + +bool +CoalesceEngine::enoughSpace() +{ + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; +} + +bool +CoalesceEngine::pullCondition() +{ + bool enough_space = enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + + + +ReadReturnStatus +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); + //assert(addr in a right slice) + // assert((cacheBlocks[block_index].addr == aligned_addr)) + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) and + (cacheBlocks[block_index].valid)) { + // Hit + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(cacheBlocks[block_index].state != CacheState::INVALID); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].state = CacheState::BUSY; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else if (cacheBlocks[block_index].state == CacheState::PENDING_DATA) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].dirty); + + assert(MSHR.find(block_index) != MSHR.end()); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else { + // // miss + 
assert(cacheBlocks[block_index].addr != aligned_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a cold miss.\n", + __func__, addr); + stats.readMisses++; + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + + onTheFlyReqs--; + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); + } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. + // Regardless of the purpose for which it was read, it should + // be placed in the cache array. 
+ assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but it was allocated + // for in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will be not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before pkt arriveed. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } + + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. 
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + activeBuffer.emplace_back(pkt, curTick()); + } else { + stats.wastefulBytesRead += pkt->getSize(); + delete pkt; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, 
curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + } + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. 
+ if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + + // NOTE: Design does not allow for write misses. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].busyMask != 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); + + // respective bit in busyMask for wl is set. 
+ assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].dirty |= true; + } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].items[wl_offset] = wl; + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); + } + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (cacheBlocks[block_index].busyMask == 0) { + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + 
atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + cacheBlocks[block_index].lastChangedTick = curTick(); + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // TODO: Figure out if this is still necessary. + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // the it is definitely currently not active. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + } else { + wb++; + } + } + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + ab = activeBuffer.erase(ab); + delete ab_pkt; + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; + } + } + if (!need_send_pkt) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } + + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. 
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + if (enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + cacheBlocks[block_index].reset(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + if (!postPushWBQueue.empty()) { + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } + } +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + pullsScheduled--; + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); + } + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); + } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); + } + } +} + +void +CoalesceEngine::recvMemRetry() +{ + 
DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__);
+
+    if (!nextMemoryEvent.pending()) {
+        DPRINTF(CoalesceEngine, "%s: Not pending MemRetry.\n", __func__);
+        return;
+    }
+    assert(!nextMemoryEvent.scheduled());
+    nextMemoryEvent.wake();
+    schedule(nextMemoryEvent, nextCycle());
+}
+
+int
+CoalesceEngine::workCount()
+{
+    return currentActiveCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size();
+}
+
+void
+CoalesceEngine::recvVertexPull()
+{
+    pullsReceived++;
+    DPRINTF(CoalesceEngine, "%s: Received a vertex pull. pullsReceived: %d.\n", __func__, pullsReceived);
+
+    stats.verticesPulled++;
+    stats.lastVertexPullTime = curTick() - stats.lastResetTick;
+    if (!nextApplyEvent.scheduled()) {
+        schedule(nextApplyEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextApplyEvent()
+{
+    if ((!activeBuffer.empty()) &&
+        (postPushWBQueue.size() < postPushWBQueueSize)) {
+        PacketPtr pkt;
+        Tick entrance_tick;
+        WorkListItem items[numElementsPerLine];
+
+        std::tie(pkt, entrance_tick) = activeBuffer.front();
+        pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+
+        for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) {
+            if (items[index].activeNow) {
+                Addr addr = pkt->getAddr() + index * sizeof(WorkListItem);
+                uint32_t delta = graphWorkload->apply(items[index]);
+                items[index].activeNow = false;
+                owner->recvVertexPush(addr, delta, items[index].edgeIndex,
+                                    items[index].degree);
+                pullsReceived--;
+                stats.verticesPushed++;
+                stats.lastVertexPushTime = curTick() - stats.lastResetTick;
+            }
+        }
+        pkt->deleteData();
+        pkt->allocate();
+        pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize);
+
+        bool atom_active_now = false;
+        for (int index = 0; index < numElementsPerLine; index++) {
+            atom_active_now |= items[index].activeNow;
+        }
+        // NOTE: If the atom is not active anymore. 
+ if (!atom_active_now) {
+            PacketPtr wb_pkt = createWritePacket(pkt->getAddr(),
+                                peerMemoryAtomSize, (uint8_t*) items);
+            postPushWBQueue.emplace_back(wb_pkt, curTick());
+            activeBuffer.pop_front();
+            memoryFunctionQueue.emplace_back(
+                [this] (int ignore, Tick schedule_tick) {
+                    processNextPostPushWB(ignore, schedule_tick);
+                }, 0, curTick());
+            if ((!nextMemoryEvent.pending()) &&
+                (!nextMemoryEvent.scheduled())) {
+                schedule(nextMemoryEvent, nextCycle());
+            }
+            delete pkt;
+        }
+    } else if (!currentActiveCacheBlocks.empty()) {
+        int num_visited_indices = 0;
+        int initial_fifo_length = currentActiveCacheBlocks.size();
+        while (true) {
+            int block_index = currentActiveCacheBlocks.front();
+            if (cacheBlocks[block_index].state == CacheState::IDLE) {
+                for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) {
+                    if (cacheBlocks[block_index].items[index].activeNow) {
+                        Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem);
+                        uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]);
+                        cacheBlocks[block_index].items[index].activeNow = false;
+                        cacheBlocks[block_index].dirty = true;
+                        owner->recvVertexPush(addr, delta,
+                            cacheBlocks[block_index].items[index].edgeIndex,
+                            cacheBlocks[block_index].items[index].degree);
+                        pullsReceived--;
+                        stats.verticesPushed++;
+                        stats.lastVertexPushTime = curTick() - stats.lastResetTick;
+                    }
+                }
+
+                bool atom_active_now = false;
+                for (int index = 0; index < numElementsPerLine; index++) {
+                    atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                }
+                // NOTE: If the atom has no active items left, retire it from the FIFO
+                if (!atom_active_now) {
+                    currentActiveCacheBlocks.erase(block_index);
+                }
+                break;
+            }
+            // NOTE: If the block with index at the front of activeCacheBlocks
+            // is not in IDLE state, then roll that index to the back
+            currentActiveCacheBlocks.pop_front();
+            currentActiveCacheBlocks.push_back(block_index);
+            // NOTE: If we have visited all the 
items initially in the FIFO.
+            num_visited_indices++;
+            if (num_visited_indices == initial_fifo_length) {
+                break;
+            }
+        }
+    } else {
+        DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__);
+        stats.worklessCycles++;
+    }
+
+    if (pullCondition()) {
+        memoryFunctionQueue.emplace_back(
+            [this] (int ignore, Tick schedule_tick) {
+                processNextVertexPull(ignore, schedule_tick);
+            }, 0, curTick());
+        if ((!nextMemoryEvent.pending()) &&
+            (!nextMemoryEvent.scheduled())) {
+            schedule(nextMemoryEvent, nextCycle());
+        }
+        pullsScheduled++;
+    }
+
+    if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) {
+        schedule(nextApplyEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextDoneSignalEvent()
+{
+    if (done()) {
+        owner->recvDoneSignal();
+    }
+}
+
+CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce)
+    : statistics::Group(&_coalesce),
+    coalesce(_coalesce),
+    lastResetTick(0),
+    ADD_STAT(numVertexReads, statistics::units::Count::get(),
+             "Number of memory vertices read from cache."),
+    ADD_STAT(numVertexWrites, statistics::units::Count::get(),
+             "Number of memory vertices written to cache."),
+    ADD_STAT(readHits, statistics::units::Count::get(),
+             "Number of cache hits."),
+    ADD_STAT(readMisses, statistics::units::Count::get(),
+             "Number of cache misses."),
+    ADD_STAT(readHitUnderMisses, statistics::units::Count::get(),
+             "Number of cache hit under misses."),
+    ADD_STAT(numConflicts, statistics::units::Count::get(),
+             "Number of conflicts raised by reads in the cache."),
+    ADD_STAT(responsePortShortage, statistics::units::Count::get(),
+             "Number of times a response has been "
+             "delayed because of port shortage. 
"), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by coalesce engine"), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "cycles the coalesce engine could not find work for apply"), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory"), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory"), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index f661bd68a6..318ea0798b 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/mpu.hh" +#include + #include "accl/graph/sega/centeral_controller.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -37,11 +39,13 @@ namespace gem5 { MPU::MPU(const Params& params): - SimObject(params), + ClockedObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), - pushEngine(params.push_engine) + pushEngine(params.push_engine), + sliceCounter(0), + nextSliceEvent([this] { processNextSliceEvent(); }, name()) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); @@ -54,12 +58,56 @@ MPU::registerCenteralController(CenteralController* centeral_controller) centeralController = centeral_controller; } +int +MPU::getSliceSize() +{ + int slice_number = + (coalesceEngine->getSliceSize() * centeralController->getnumGPTs()); + + return slice_number; +} + +bool +MPU::bufferRemoteUpdate(int slice_number, PacketPtr pkt) +{ + return 
centeralController->bufferRemoteUpdate(slice_number, pkt); +} + bool MPU::handleIncomingUpdate(PacketPtr pkt) { return wlEngine->handleIncomingUpdate(pkt); } +void +MPU::scheduleNewSlice() +{ + if (!nextSliceEvent.scheduled()) { + schedule(nextSliceEvent, nextCycle()); + } + return; +} + +void +MPU::processNextSliceEvent() +{ + auto new_update = + centeralController->remoteUpdates[this][this->getSliceCounter()].front(); + bool sent = wlEngine->handleIncomingUpdate(new_update); + + centeralController->remoteUpdates[this] + [this->getSliceCounter()].pop_front(); + if (!sent) { + centeralController->remoteUpdates[this] + [this->getSliceCounter()].push_back(new_update); + } + + if (!centeralController->remoteUpdates[this][this->getSliceCounter()].empty() && !nextSliceEvent.scheduled()) { + schedule(nextSliceEvent, nextCycle()); + } + +} + void MPU::handleIncomingWL(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 95d3adeca5..2008a7dc4f 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -39,7 +39,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/packet.hh" -#include "sim/sim_object.hh" +#include "sim/clocked_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -48,7 +48,7 @@ namespace gem5 class CenteralController; -class MPU : public SimObject +class MPU : public ClockedObject { private: System* system; @@ -57,7 +57,10 @@ class MPU : public SimObject WLEngine* wlEngine; CoalesceEngine* coalesceEngine; PushEngine* pushEngine; + int sliceCounter; + EventFunctionWrapper nextSliceEvent; + void processNextSliceEvent(); public: PARAMS(MPU); MPU(const Params& params); @@ -74,8 +77,15 @@ class MPU : public SimObject void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } void swapDirectories() { coalesceEngine->swapDirectories(); } - bool handleIncomingUpdate(PacketPtr pkt); + int getSliceSize(); + int getSliceCounter() { return 
sliceCounter; } + int increaseSliceCounter() { return sliceCounter++; } + void updateSliceCounter(int value) { sliceCounter = value;} + void resetSliceCounter() { sliceCounter = 0; } + bool bufferRemoteUpdate(int slice_number, PacketPtr pkt); + void scheduleNewSlice(); + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index d563450179..b4649b6a9d 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -149,6 +149,19 @@ WLEngine::done() bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { + int slice_number = (int)(pkt->getAddr()/(owner->getSliceSize())); + if (slice_number != owner->getSliceCounter()) { + DPRINTF(WLEngine, "%s: Packet %lu slice number is: %d. The current " + "slice number is: %d, The total number of vertices/slice: %d \n", + __func__, pkt->getAddr(), slice_number, + owner->getSliceCounter(), + owner->getSliceSize()/sizeof(WorkListItem)); + bool ret = owner->bufferRemoteUpdate(slice_number, pkt); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return ret; + } assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; @@ -173,6 +186,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return true; } + // TODO: Parameterize the number of pops WLEngine can do at a time. // TODO: Add a histogram stats of the size of the updateQueue. Sample here. void