From 0f3fc73c4df1158760d8959d7d66d3eb1265fd9e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 31 Jan 2022 15:52:52 -0800 Subject: [PATCH 001/247] accl: Adding src code for PushEngine. --- src/accl/push_engine.hh | 69 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/accl/push_engine.hh diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh new file mode 100644 index 0000000000..eda9d7b707 --- /dev/null +++ b/src/accl/push_engine.hh @@ -0,0 +1,69 @@ +#ifndef __ACCL_PUSH_ENGINE_HH__ +#define __ACCL_PUSH_ENGINE_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/PushEngine.hh" +#include "sim/clocked_object.hh" + +class PushEngine : public ClockedObject +{ + private: + + class PushRespPort : public ResponsePort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + } + + class PushReqPort : public RequestPort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual bool recvTimingResp(PacketPtr pkt); + } + + class PushMemPort : public RequestPort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushMemPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + bool sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + } + + PushRespPort respPort; + PushReqPort reqPort; + PushMemPort memPort; + + std::queue vertexQueue; + std::queue updateQueue; + + std::pair interpretPackPtr(PacketPtr pkt); + +}; + +#endif // __ACCL_PUSH_ENGINE_HH__ From 0dd0beb81d3910a313bb97c0c0dd1489e9f567ae Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 
Feb 2022 17:56:49 -0800 Subject: [PATCH 002/247] Adding implementation for PushEngine (wip). --- src/accl/push_engine.cc | 120 ++++++++++++++++++++++++++++++++++++++++ src/accl/push_engine.hh | 63 ++++++++++++++++++++- src/accl/util.cc | 16 ++++++ src/accl/util.hh | 4 ++ 4 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 src/accl/push_engine.cc create mode 100644 src/accl/util.cc create mode 100644 src/accl/util.hh diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc new file mode 100644 index 0000000000..bc3138f61e --- /dev/null +++ b/src/accl/push_engine.cc @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/push_engine.hh" + +#include "debug/PushEngine.hh" + +PushEngine::PushEngine(const PushEngineParams& params): + ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), + vertexQueueSize(params.vertex_queue_size), + vertexQueueLen(0), + updateQueue(params.update_queue_size), + updateQueueLen(0), + nextReceiveEvent([this]{ processNextReceiveEvent(); }, name()), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextCreateEvent([this]{ processNextCreateEvent(); }, name()), + nextSendEvent([this]{ processNextSendEvent(); }, name()) +{} + +Port & +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +bool +PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleUpdate(pkt); +} + +bool +PushEngine::handleUpdate(PacketPtr pkt) +{ + if (vertexQueueLen < vertexQueueSize) { + vertexQueue.push(pkt) + vertexQueueLen++; + return true; + + if (!nextReceiveEvent.scheduled()){ + schedule(nextReceiveEvent, nextCycle()); + } + } + return false; +} + +void +PushEngine::processNextReceiveEvent() +{ + PacketPtr 
updatePkt = vertexQueue.pop(); + uint8_t* data = updatePkt->getData(); + + Addr edgeListAddr = ; // TODO: Generalize finding this address. + int outDegree = ; // TODO: Generalize finding this value. + + Addr reqAddr = (edgeListAddr / 64) * 64; + Addr offsetAddr = edgeListAddr % 64; + + PacketPtr pkt = getReadPacket(reqAddr, 64, requestorId); + + memPort.sendPacket(pkt); + + +} + +void +PushEngine::processNextReadEvent() +{ + +} + +void +PushEngine::processNextCreateEvent() +{ + +} + +void +PushEngine::processNextSendEvent() +{ + +} \ No newline at end of file diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index eda9d7b707..6ab902d0e2 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -1,8 +1,35 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #ifndef __ACCL_PUSH_ENGINE_HH__ #define __ACCL_PUSH_ENGINE_HH__ #include -#include #include "base/addr_range_map.hh" #include "base/statistics.hh" @@ -10,6 +37,7 @@ #include "mem/packet.hh" #include "params/PushEngine.hh" #include "sim/clocked_object.hh" +#include "sim/system.hh" class PushEngine : public ClockedObject { @@ -18,6 +46,7 @@ class PushEngine : public ClockedObject class PushRespPort : public ResponsePort { private: + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; @@ -55,14 +84,42 @@ class PushEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); } - PushRespPort respPort; + System* const system; + const RequestorID requestorId; + PushReqPort reqPort; + PushRespPort respPort; + PushMemPort memPort; std::queue vertexQueue; + int vertexQueueSize; + int vertexQueueLen; + std::queue updateQueue; + int updateQueueSize; + int updateQueueLen; + + EventFunctionWrapper nextReceiveEvent; + void processNextReceiveEvent(); + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextCreateEvent; + void processNextCreateEvent(); + + EventFunctionWrapper nextSendEvent; + void processNextSendEvent(); + + bool handleUpdate(PacketPtr pkt); + + public: + + PushEngine(const PushEngineParams ¶ms); - std::pair interpretPackPtr(PacketPtr pkt); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; diff --git a/src/accl/util.cc 
b/src/accl/util.cc new file mode 100644 index 0000000000..20abd1c13a --- /dev/null +++ b/src/accl/util.cc @@ -0,0 +1,16 @@ +#include "accl/util.hh" + +PacketPtr +getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) +{ + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr)requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} diff --git a/src/accl/util.hh b/src/accl/util.hh new file mode 100644 index 0000000000..c621b9e45c --- /dev/null +++ b/src/accl/util.hh @@ -0,0 +1,4 @@ +#include "mem/packet.hh" + +PacketPtr getReadPacket(Addr addr, unsigned int size); + From 3b359ade313c989b465a5879d738096526cbf6c4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Feb 2022 13:36:08 -0800 Subject: [PATCH 003/247] Adding util source code. --- src/accl/util.cc | 28 ++++++++++++++++++++++++++++ src/accl/util.hh | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/accl/util.cc b/src/accl/util.cc index 20abd1c13a..8d975c482f 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #include "accl/util.hh" PacketPtr diff --git a/src/accl/util.hh b/src/accl/util.hh index c621b9e45c..18b8e4c197 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -1,4 +1,50 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "base/types.hh" #include "mem/packet.hh" -PacketPtr getReadPacket(Addr addr, unsigned int size); +struct WorkListItem +{ + uint32_t temp_prop; + uint32_t prop; + uint32_t degree; + Addr edgeList; +} + +struct Edge +{ + uint32_t weight; + Addr neighbor; +} + +WorkListItem& memoryToWorkList(uint8_t* data); +Edge& memoryToEdge(uint8_t* data); +PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); From f74e9df55bafd83ea180ad6b9db91840f0e3b9e5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 31 Jan 2022 11:34:07 -0800 Subject: [PATCH 004/247] Adding the first version of Apply engine --- src/accl/apply.cc | 129 ++++++++++++++++++++++++++++++++++++++++++++ src/accl/apply.hh | 132 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 src/accl/apply.cc create mode 100644 src/accl/apply.hh diff --git a/src/accl/apply.cc b/src/accl/apply.cc new file mode 100644 index 0000000000..d0e2b712a6 --- /dev/null +++ b/src/accl/apply.cc @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/apply.h" + +#include + + +typedef std::pair ReqPair; +typedef std::pair QueuePair; + +Apply::Apply(const ApplyParams ¶ms): + ClockedObject(params), + nextApplyEvent([this]{processNextApplyEvent; }, name()), + nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), + queueSize(params.applyQueueSize) //add this to .py +{ + applyReadQueue(queueSize); + pplyWriteQueue(queueSize); +} + +bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleWL(pkt)){ + return false; + } + return true; +} + +bool Apply::handleWL(PacketPtr pkt){ + auto queue = applyReadQueue; + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextApplyCheckEvent.scheduled()){ + schedule(nextApplyCheckEvent, nextCycle()); + } + return true; +} + + +void Apply::processNextApplyCheckEvent(){ + auto queue = applyReadQueue; + memPort = ApplyMemPort + while(!queue.empty()){ + auto pkt = queue.pop() + /// conver to ReadReq + bool ret = memPort->sendPacket(pkt); + // handel responsehere + if (!ret) + break; + } + +} + +virtual bool +Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); 
+} + +bool +Apply::handleMemResp(PacktPtr pkt) +{ + auto queue = applyWriteQueue; + //check pkt (temp_prop != prop) + if (temp_prop != prop){ + //update prop with temp_prop + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); + } + return true; + } + return true; +} + + + +void +Apply::processNextApplyEvent(){ + auto queue = applyWriteQueue; + memPort = ApplyMemPort; + pushPort = ApplyReqPort; + while(!queue.empty()){ + auto pkt = queue.pop() + /// conver to ReadReq + bool ret = memPort->sendPacket(pkt); + bool push = pushPort->sendPacket(pkt); + // handel responsehere + if (!ret || !push) + break; + + } + +} \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh new file mode 100644 index 0000000000..2ae593a1cb --- /dev/null +++ b/src/accl/apply.hh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_APPLY_HH__ +#define __ACCL_APPLY_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class Apply : public ClockedObject +{ + private: + + class ApplyRespPort : public ResponsePort + { + private: + Apply *owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ApplyRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + } + + class ApplyReqPort : public RequestPort + { + private: + APPLY *owner; + bool _blocked; + PacketPtr blockedPacket; + + struct ApplyQueue{ + std::queue applyQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return applyQueue.size() == queueSize; + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize){} + }; + public: + ApplyReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual bool recvTimingResp(PacketPtr pkt); + } + + class ApplyMemPort : public RequestPort + { + private: + Apply *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + ApplyReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + bool 
sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + + } + bool handleWL(PacketPtr pkt); + bool sendPacket(); + //one queue for write and one for read a priotizes write over read + void readApplyBuffer(); + bool handleMemResp(PacktPtr resp); + void writePushBuffer(); + + + //Events + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + ApplyQueue applyQueue; + ApplyMemPort memPort; + public(const ApplyParams &apply); +}; + +#endif // __ACCL_APPLY_HH__ \ No newline at end of file From 7945cf333644c9ad0f0e5dfb99e8040d3944785d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 5 Feb 2022 20:34:12 -0800 Subject: [PATCH 005/247] Portotyping memory interface --- src/accl/apply.cc | 36 ++++++++++++++++++++++-------------- src/accl/apply.hh | 8 +++++--- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index d0e2b712a6..b0ef5e8513 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -46,7 +46,7 @@ Apply::Apply(const ApplyParams ¶ms): bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) { - if (!owner->handleWL(pkt)){ + if (!this->handleWL(pkt)){ return false; } return true; @@ -73,7 +73,9 @@ void Apply::processNextApplyCheckEvent(){ while(!queue.empty()){ auto pkt = queue.pop() /// conver to ReadReq - bool ret = memPort->sendPacket(pkt); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + bool ret = memPort->sendPacket(memPkt); // handel responsehere if (!ret) break; @@ -84,27 +86,24 @@ void Apply::processNextApplyCheckEvent(){ virtual bool Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) { - 
return owner->handleMemResp(pkt); + return this->handleMemResp(pkt); } bool Apply::handleMemResp(PacktPtr pkt) { auto queue = applyWriteQueue; - //check pkt (temp_prop != prop) - if (temp_prop != prop){ - //update prop with temp_prop + if (queue->blocked()){ sendPktRetry = true; return false; } else - queue->push(pkt); + queue->push(writePkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } return true; - } return true; } @@ -117,12 +116,21 @@ Apply::processNextApplyEvent(){ pushPort = ApplyReqPort; while(!queue.empty()){ auto pkt = queue.pop() - /// conver to ReadReq - bool ret = memPort->sendPacket(pkt); - bool push = pushPort->sendPacket(pkt); - // handel responsehere - if (!ret || !push) - break; + uint64_t* data = pkt->getPtr(); + uint32_t* prop = data; + uint32_t* temp_prop = prop + 1; + if (*temp_prop != *prop){ + //update prop with temp_prop + *prop = min(*prop , *temp_prop); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); + writePkt->setData(data); + bool ret = memPort->sendPacket(pkt); + bool push = pushPort->sendPacket(pkt); + // handel response here + if (!ret || !push) + break; + } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 2ae593a1cb..e9c27a1fcf 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -61,7 +61,7 @@ class Apply : public ClockedObject class ApplyReqPort : public RequestPort { private: - APPLY *owner; + Apply *owner; bool _blocked; PacketPtr blockedPacket; @@ -124,9 +124,11 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ - ApplyQueue applyQueue; + ApplyQueue applyReadQueue; + ApplyQueue applyWriteQueue; ApplyMemPort memPort; - public(const ApplyParams &apply); + std::pair + public(const ApplyParams &apply); //fix this }; #endif // __ACCL_APPLY_HH__ \ No newline at end of file From 14426cddc9527e56cf96cb15d7382199e4309e98 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 12:04:02 
-0800 Subject: [PATCH 006/247] [wip] Improving the implementation. Adding address range, python params. --- src/accl/Apply.py | 39 ++++++++++++ src/accl/apply.cc | 153 +++++++++++++++++++++++++++++++++++----------- src/accl/apply.hh | 42 ++++++++++--- 3 files changed, 191 insertions(+), 43 deletions(-) create mode 100644 src/accl/Apply.py diff --git a/src/accl/Apply.py b/src/accl/Apply.py new file mode 100644 index 0000000000..01c627d4c8 --- /dev/null +++ b/src/accl/Apply.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class Apply(ClockedObject): + type = 'Apply' + cxx_header = "accl/apply.hh" + cxx_class = 'gem5::Apply' + + respPort = ResponsePort("Receives requests from WorkList") + reqPort = RequestPort("Sends requests to Push") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b0ef5e8513..d605537033 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -26,22 +26,41 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/apply.h" +#include "accl/apply.hh" #include - -typedef std::pair ReqPair; -typedef std::pair QueuePair; - Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), nextApplyEvent([this]{processNextApplyEvent; }, name()), nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), queueSize(params.applyQueueSize) //add this to .py { applyReadQueue(queueSize); - pplyWriteQueue(queueSize); + applyWriteQueue(queueSize); +} + +Port & +Apply::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +AddrRangeList +Apply::ApplyRespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); } bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) @@ -52,6 +71,65 @@ bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) return true; } +void +Apply::ApplyRespPort::trySendRetry() +{ + sendRetryReq(); +} + + +virtual bool +Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +void +WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +Apply::ApplyMemPort::trySendRetry() +{ + sendRetryReq(); +} + +void +Apply::ApplyMemPort::recvReqRetry() +{ + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +void +WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +Apply::ApplyRequestPort::recvReqRetry() +{ + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +AddrRangeList +Apply::getAddrRanges() const +{ + return memPort.getAddrRanges(); +} + bool 
Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue->blocked()){ @@ -59,34 +137,29 @@ bool Apply::handleWL(PacketPtr pkt){ return false; } else queue->push(pkt); - if(!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; } - void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - memPort = ApplyMemPort while(!queue.empty()){ - auto pkt = queue.pop() - /// conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - bool ret = memPort->sendPacket(memPkt); - // handel responsehere - if (!ret) - break; + if(!memPort->blocked()){ + auto pkt = queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + respPort->trySendRetry(); + queue->sendPktRetry = false; + } + // conver to ReadReq + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + memPort->sendPacket(memPkt); + } + else + return; } - -} - -virtual bool -Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - return this->handleMemResp(pkt); } bool @@ -107,31 +180,39 @@ Apply::handleMemResp(PacktPtr pkt) return true; } - - void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - memPort = ApplyMemPort; - pushPort = ApplyReqPort; while(!queue.empty()){ - auto pkt = queue.pop() + auto pkt = queue.front(); uint64_t* data = pkt->getPtr(); uint32_t* prop = data; uint32_t* temp_prop = prop + 1; if (*temp_prop != *prop){ //update prop with temp_prop *prop = min(*prop , *temp_prop); - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); writePkt->setData(data); - bool ret = memPort->sendPacket(pkt); - bool push = pushPort->sendPacket(pkt); - // handel response here - if (!ret || !push) + if (!memPort->blocked() && !reqPort->blocked()){ //re-think this + 
memPort->sendPacket(pkt); + applyReqPort->sendPacket(pkt); + queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } + else break; } - + else{ + queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } } - } \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh index e9c27a1fcf..fab4cf871a 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -56,6 +56,7 @@ class Apply : public ClockedObject virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + void trySendRetry(); } class ApplyReqPort : public RequestPort @@ -64,7 +65,6 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; - struct ApplyQueue{ std::queue applyQueue; const uint_32 queueSize; @@ -83,12 +83,19 @@ class Apply : public ClockedObject ApplyQueue(uint32_t qSize): queueSize(qSize){} }; + public: ApplyReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); + void sendPacket(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + protected: + void recvReqRetry() override; virtual bool recvTimingResp(PacketPtr pkt); - } + }; class ApplyMemPort : public RequestPort { @@ -96,13 +103,21 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; + public: ApplyReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - bool sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); + void trySendRetry(); + bool blocked(){ + return _blocked; + } + + protected: virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; + }; - } bool handleWL(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read @@ -110,7 +125,6 @@ class Apply : public ClockedObject bool handleMemResp(PacktPtr resp); void writePushBuffer(); - //Events void 
processNextApplyCheckEvent(); /* Syncronously checked @@ -124,11 +138,25 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ + void processNextApplyEvent(); + EventFunctionWrapper nextApplyEvent; + + void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyCheckEvent; + + AddrRangeList getAddrRanges() const; + ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; + ApplyMemPort memPort; - std::pair - public(const ApplyParams &apply); //fix this + ApplyRespPort respPort; + ApplyRequestPort reqPort; + + public: + Apply(const ApplyParams &apply); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; #endif // __ACCL_APPLY_HH__ \ No newline at end of file From 8e79d19e2028a80dda8aa7b2026a010310fec300 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 13:14:27 -0800 Subject: [PATCH 007/247] [wip] minor fixes to Apply engine --- src/accl/apply.cc | 8 ++++---- src/accl/apply.hh | 44 +++++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index d605537033..6ad630f0ac 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -96,7 +96,7 @@ WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) void Apply::ApplyMemPort::trySendRetry() { - sendRetryReq(); + sendRetryResp(); } void @@ -108,7 +108,7 @@ Apply::ApplyMemPort::recvReqRetry() } void -WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) +WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -117,7 +117,7 @@ WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyRequestPort::recvReqRetry() +Apply::ApplyReqtPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -158,7 +158,7 @@ void Apply::processNextApplyCheckEvent(){ memPort->sendPacket(memPkt); } else - return; + break; } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index fab4cf871a..dae3d8ec0e 100644 --- 
a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -43,11 +43,29 @@ class Apply : public ClockedObject { private: + struct ApplyQueue{ + std::queue applyQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return applyQueue.size() == queueSize; + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize){} + }; + class ApplyRespPort : public ResponsePort { private: Apply *owner; - bool _blocked; PacketPtr blockedPacket; public: @@ -55,9 +73,11 @@ class Apply : public ClockedObject PortID id=InvalidPortID); virtual AddrRangeList getAddrRanges(); - virtual bool recvTimingReq(PacketPtr pkt); void trySendRetry(); - } + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + }; class ApplyReqPort : public RequestPort { @@ -65,24 +85,6 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; - struct ApplyQueue{ - std::queue applyQueue; - const uint_32 queueSize; - bool sendPktRetry; - - bool blocked(){ - return applyQueue.size() == queueSize; - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize){} - }; public: ApplyReqPort(const std::string& name, SimObject* _owner, From 469a8f7f7897289d5295500f18e7a60e691123d0 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 7 Feb 2022 12:26:01 -0800 Subject: [PATCH 008/247] Worklist engine implementation --- src/accl/wl_engine.cc | 185 ++++++++++++++++++++++++++++++++++++++++++ src/accl/wl_engine.hh | 143 ++++++++++++++++++++++++++++++++ 2 files changed, 328 insertions(+) create mode 100644 src/accl/wl_engine.cc create mode 100644 src/accl/wl_engine.hh diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc new file mode 100644 index 0000000000..28f8a4fe11 --- /dev/null +++ b/src/accl/wl_engine.cc @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 The Regents of the 
University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/wl_engine.hh" + +#include + + +WLEngine::WLEngine(const WLEngineParams ¶ms): + ClockedObject(params), + nextWLReadEvent([this]{processNextWLReadEvent; }, name()), + nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), + queueSize(params.wlQueueSize) //add this to .py +{ + wlReadQueue(queueSize); + wlWriteQueue(queueSize); +} + +bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) +{ + if (!this->handleWLUpdate(pkt)){ + return false; + } + return true; +} + +bool WLEngine::handleWLUpdate(PacketPtr pkt){ + auto queue = wlReadQueue; + if (queue->blocked()){ + queue->sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextWLReadEvent.scheduled()){ + schedule(nextWLReadEvent, nextCycle()); + } + return true; +} + + +void WLEngine::processNextWLReadEvent(){ + auto queue = wlReadQueue; + memPort = WLMemPort + while(!queue.empty()){ //create a map instead of front + auto pkt = queue.front() + /// conver to ReadReq + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + if (!memPort->blocked()){ + memPort->sendPacket(memPkt); + break; + } + } + +} + +void +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. 
+ assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; + + owner->wakeUp(); //TODO +} + +virtual bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +bool +WLEngine::handleMemResp(PacktPtr pkt) +{ + auto queue = applyWriteQueue; + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(writePkt); + + if(!nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } + return true; + return true; +} + +void +WLEngine::processNextWLReduceEvent(){ + auto queue = wlWriteQueue; + auto updateQ = wlReadQueue; + memPort = WLMemPort; + applyPort = WLReqPort; + while(!queue.empty()){ + auto update = updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + WLRespPort->trySendRetry(); + updateQ->sendPktRetry = false; + } + auto pkt = queue.front() + uint64_t* updatePtr = pkt->getPtr(); + uint64_t* data = pkt->getPtr(); + uint32_t* value = updatePtr; + uint32_t* temp_prop = prop + 1; + if (*value != *prop){ + //update prop with temp_prop + *temp_prop = min(*value , *temp_prop); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); + writePkt->setData(data); + if (!memPort->blocked() && !applyPort->blocked()){ + memPort->sendPacket(pkt); + applyPort->sendPacket(pkt); + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } + else + break; + } + else{ + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + + } + + } + +} + +void +WLEngine::WLRespPort::trySendRetry() +{ + sendRetryReq(); +} + +void +WLEngine::WLMemPort::trySendRetry() +{ + sendRetryResp(); +} \ No newline at end of file diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh new file mode 100644 index 0000000000..7269965ff2 --- 
/dev/null +++ b/src/accl/wl_engine.hh @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_WLE_HH__ +#define __ACCL_WLE_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class WLEngine : public ClockedObject +{ + private: + + struct WLQueue{ + std::queue wlQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return wlQueue.size() == queueSize; + } + bool empty(){ + return wlQueue.empty(); + } + void push(PacketPtr pkt){ + wlQueue.push(pkt); + } + + WLReqPort(uint32_t qSize): + queueSize(qSize){} + }; + + class WLRespPort : public ResponsePort //From Push engine + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + WLRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + } + + class WLReqPort : public RequestPort //To Apply Engine + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + WLReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + void trySendRetry(); + virtual bool recvTimingResp(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + } + + class WLMemPort : public RequestPort + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + WLMemPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + void sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + void trySendRetry(); + bool blocked(){ + return _blocked; + } + } + bool handleWLU(PacketPtr pkt); + bool sendPacket(); + //one queue for write and one for read a priotizes write over read + void readWLBuffer(); + bool handleMemResp(PacktPtr resp); + + + //Events + void processNextWLReadEvent(); + /* Syncronously checked + If there are any active vertecies: + create 
memory read packets + MPU::MPU::MemPortsendTimingReq + */ + void processNextWLReduceEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + WLQueue wlReadQueue; + WLQueue wlWriteQueue; + WLMemPort memPort; + std::pair + public: + WLEngine(const WLEngineParams ¶ms); //fix this +}; + +#endif // __ACCL_WLE_HH__ \ No newline at end of file From af73e980a6f14878b8ad77fc6c4d7a649f3d2bcd Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 13:06:32 -0800 Subject: [PATCH 009/247] [wip] Adding the python file to the WLE --- src/accl/WLEngine.py | 39 ++++++++++++ src/accl/wl_engine.cc | 138 ++++++++++++++++++++++++++++-------------- src/accl/wl_engine.hh | 46 ++++++++++---- 3 files changed, 165 insertions(+), 58 deletions(-) create mode 100644 src/accl/WLEngine.py diff --git a/src/accl/WLEngine.py b/src/accl/WLEngine.py new file mode 100644 index 0000000000..fe6b25b6ba --- /dev/null +++ b/src/accl/WLEngine.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class WLEngine(ClockedObject): + type = 'WLEngine' + cxx_header = "accl/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + respPort = ResponsePort("Receives updates") + reqPort = RequestPort("Sends requests to Apply") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 28f8a4fe11..fbf201720d 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -33,6 +33,9 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), nextWLReadEvent([this]{processNextWLReadEvent; }, name()), nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), queueSize(params.wlQueueSize) //add this to .py @@ -41,6 +44,26 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): wlWriteQueue(queueSize); } +Port & +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == 
"memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +AddrRangeList +WLEngine::WLRespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) { if (!this->handleWLUpdate(pkt)){ @@ -49,6 +72,68 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } +void +WLEngine::WLRespPort::trySendRetry() +{ + sendRetryReq(); +} + +void +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. + assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +virtual bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +void +WLEngine::WLMemPort::trySendRetry() +{ + sendRetryResp(); +} + +void +WLEngine::WLReqPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. 
+ assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +void +WLEngine::WLReqPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +AddrRangeList +WLEngine::getAddrRanges() const +{ + return memPort.getAddrRanges(); +} + bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = wlReadQueue; if (queue->blocked()){ @@ -63,14 +148,14 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ return true; } - void WLEngine::processNextWLReadEvent(){ auto queue = wlReadQueue; memPort = WLMemPort while(!queue.empty()){ //create a map instead of front auto pkt = queue.front() /// conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); if (!memPort->blocked()){ memPort->sendPacket(memPkt); @@ -80,37 +165,10 @@ void WLEngine::processNextWLReadEvent(){ } -void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -WLEngine::WLMemPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. 
- assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; - - owner->wakeUp(); //TODO -} - -virtual bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) -{ - return this->handleMemResp(pkt); -} - bool WLEngine::handleMemResp(PacktPtr pkt) { - auto queue = applyWriteQueue; + auto queue = wlWriteQueue; if (queue->blocked()){ sendPktRetry = true; return false; @@ -128,12 +186,11 @@ void WLEngine::processNextWLReduceEvent(){ auto queue = wlWriteQueue; auto updateQ = wlReadQueue; - memPort = WLMemPort; - applyPort = WLReqPort; + applyPort = reqPort; while(!queue.empty()){ auto update = updateQ.pop() if (!updateQ->blocked() & updateQ->sendPktRetry){ - WLRespPort->trySendRetry(); + respPort->trySendRetry(); updateQ->sendPktRetry = false; } auto pkt = queue.front() @@ -144,7 +201,8 @@ WLEngine::processNextWLReduceEvent(){ if (*value != *prop){ //update prop with temp_prop *temp_prop = min(*value , *temp_prop); - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); writePkt->setData(data); if (!memPort->blocked() && !applyPort->blocked()){ @@ -171,15 +229,3 @@ WLEngine::processNextWLReduceEvent(){ } } - -void -WLEngine::WLRespPort::trySendRetry() -{ - sendRetryReq(); -} - -void -WLEngine::WLMemPort::trySendRetry() -{ - sendRetryResp(); -} \ No newline at end of file diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 7269965ff2..3f39ec7ee8 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -66,7 +66,6 @@ class WLEngine : public ClockedObject { private: WLEngine *owner; - bool _blocked; PacketPtr blockedPacket; public: @@ -74,11 +73,11 @@ class WLEngine : public ClockedObject PortID id=InvalidPortID); virtual AddrRangeList getAddrRanges(); + void trySendRetry(); + + protected: virtual bool recvTimingReq(PacketPtr pkt); - bool blocked(){ - return 
_blocked; - } - } + }; class WLReqPort : public RequestPort //To Apply Engine { @@ -86,15 +85,19 @@ class WLEngine : public ClockedObject WLEngine *owner; bool _blocked; PacketPtr blockedPacket; + public: WLReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - void trySendRetry(); - virtual bool recvTimingResp(PacketPtr pkt); + void sendPacket(PacketPtr pkt); bool blocked(){ return _blocked; } - } + + protected: + void recvReqRetry() override; + virtual bool recvTimingResp(PacketPtr pkt); + }; class WLMemPort : public RequestPort { @@ -102,16 +105,21 @@ class WLEngine : public ClockedObject WLEngine *owner; bool _blocked; PacketPtr blockedPacket; + public: WLMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); void sendPacket(PacktPtr pkt); - virtual bool recvTimingResp(PacketPtr pkt); void trySendRetry(); bool blocked(){ return _blocked; } - } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; + }; + bool handleWLU(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read @@ -131,13 +139,27 @@ class WLEngine : public ClockedObject read + write Write edgelist loc in buffer */ + void processNextWLReadEvent(); + EventFunctionWrapper nextWLReadEvent; + + void processNextWLReduceEvent(); + EventFunctionWrapper nextWLReduceEvent; + + AddrRangeList getAddrRanges() const; WLQueue wlReadQueue; WLQueue wlWriteQueue; WLMemPort memPort; - std::pair + + WLMemPort memPort; + WLRespPort respPort; + WLRequestPort reqPort; + public: - WLEngine(const WLEngineParams ¶ms); //fix this + + WLEngine(const WLEngineParams ¶ms); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; #endif // __ACCL_WLE_HH__ \ No newline at end of file From 23e3f42ae186681dedf173e0b42a20bd6b918ab2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Feb 2022 13:06:45 -0800 Subject: [PATCH 010/247] Changing some small errors --- 
src/accl/wl_engine.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index fbf201720d..e49ad44bf1 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -162,7 +162,6 @@ void WLEngine::processNextWLReadEvent(){ break; } } - } bool @@ -188,12 +187,8 @@ WLEngine::processNextWLReduceEvent(){ auto updateQ = wlReadQueue; applyPort = reqPort; while(!queue.empty()){ - auto update = updateQ.pop() - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; - } - auto pkt = queue.front() + auto update = updateQ.front(); + auto pkt = queue.front(); uint64_t* updatePtr = pkt->getPtr(); uint64_t* data = pkt->getPtr(); uint32_t* value = updatePtr; @@ -213,6 +208,11 @@ WLEngine::processNextWLReduceEvent(){ memPort->trySendRetry(); queue->sendPktRetry = false; } + updateQ.pop(); + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } } else break; @@ -223,6 +223,11 @@ WLEngine::processNextWLReduceEvent(){ memPort->trySendRetry(); queue->sendPktRetry = false; } + updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } } From 495fc758be9b02fa2e4d8187c57d486c70aa78e3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Feb 2022 17:39:58 -0800 Subject: [PATCH 011/247] [wip] using util in the creating memory packets --- src/accl/apply.cc | 69 ++++++++++++++++++------------ src/accl/apply.hh | 6 +++ src/accl/util.cc | 43 +++++++++++++++++++ src/accl/util.hh | 3 +- src/accl/wl_engine.cc | 97 ++++++++++++++++++++++++------------------- src/accl/wl_engine.hh | 10 ++++- 6 files changed, 155 insertions(+), 73 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 6ad630f0ac..6b474d5628 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -32,6 +32,8 @@ Apply::Apply(const ApplyParams 
¶ms): ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), @@ -145,20 +147,25 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - while(!queue.empty()){ - if(!memPort->blocked()){ - auto pkt = queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - respPort->trySendRetry(); - queue->sendPktRetry = false; - } - // conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - memPort->sendPacket(memPkt); + if(!memPort->blocked()){ + auto pkt = queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + respPort->trySendRetry(); + queue->sendPktRetry = false; } - else - break; + // conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr req = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + requestOffset[req] = req_offset; + memPort->sendPacket(memPkt); + } + else{ + break; + } + if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ + schedule(nextApplyCheckEvent, nextCycle()); } } @@ -183,21 +190,27 @@ Apply::handleMemResp(PacktPtr pkt) void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - while(!queue.empty()){ auto pkt = queue.front(); - uint64_t* data = pkt->getPtr(); - uint32_t* prop = data; - uint32_t* temp_prop = prop + 1; - if (*temp_prop != *prop){ - //update prop with temp_prop - *prop = min(*prop , *temp_prop); - RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); - writePkt->setData(data); - if (!memPort->blocked() && !reqPort->blocked()){ //re-think this - memPort->sendPacket(pkt); - applyReqPort->sendPacket(pkt); + uint8_t* data = pkt->getPtr(); + + RequestPtr req = 
pkt->req; + int request_offset = requestOffset[req]; + WorkListItem wl = memoryToWorkList(data + request_offset); + uint32_t prop = wl.prop; + uint32_t temp_prop = wl.temp_prop; + + if (temp_prop != prop){ + if (!memPort->blocked() && !reqPort->blocked()){ + //update prop with temp_prop + wl.prop = min(prop , temp_prop); + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + memPort->sendPacket(writePkt); + applyReqPort->sendPacket(writePkt); queue.pop(); if(queue->sendPktRetry && !queue->blocked()){ memPort->trySendRetry(); @@ -214,5 +227,7 @@ Apply::processNextApplyEvent(){ queue->sendPktRetry = false; } } + if(!queue.empty() && !nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); } } \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh index dae3d8ec0e..b213d37667 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -32,6 +32,7 @@ #include #include +#include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" #include "mem/port.hh" @@ -146,6 +147,9 @@ class Apply : public ClockedObject void processNextApplyCheckEvent(); EventFunctionWrapper nextApplyCheckEvent; + System* const system; + const RequestorID requestorId; + AddrRangeList getAddrRanges() const; ApplyQueue applyReadQueue; @@ -155,6 +159,8 @@ class Apply : public ClockedObject ApplyRespPort respPort; ApplyRequestPort reqPort; + std::unordered_map requestOffset; + public: Apply(const ApplyParams &apply); Port &getPort(const std::string &if_name, diff --git a/src/accl/util.cc b/src/accl/util.cc index 8d975c482f..8debd3a937 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -42,3 +42,46 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } + +PacketPtr getWritePacket(Addr addr, + unsigned 
int size, + uint8_t* data, + RequestorID requestorId) +{ + equestPtr req = std::make_shared(addr, size, 0, + requestorId); + req->setPC(((Addr)requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +WorkListItem& +memoryToWorkList(uint8_t* data){ + WorkListItem wl; + uint32_t temp_prop = *((uint32_t*) data)); + + uint32_t prop = *((uint32_t*) (data + 4)); + + uint32_t degree = *((uint32_t*) (data + 8)); + + uint32_t addr = *((uint32_t*) (data + 12)); + + retrun wl = {temp_prop, prop, degree, addr}; +} + +unit8_t* +workListToMemory(WorkListItem wl){ + int data_size = sizeof(WorkListItem)/sizeof(uint_8) + uint_8* data = new uint8_t [data_size]; + uint_32* wList = (uint_32*)data; + *wList = wl.prop; + *wList + 1 = wl.temp_prop; + *wList + 2 = wl.degree; + *wList + 3 = wl.edgeIndex; + + return data; +} \ No newline at end of file diff --git a/src/accl/util.hh b/src/accl/util.hh index 18b8e4c197..00ccb7ddd9 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -34,7 +34,7 @@ struct WorkListItem uint32_t temp_prop; uint32_t prop; uint32_t degree; - Addr edgeList; + uint32_t edgeIndex; } struct Edge @@ -44,6 +44,7 @@ struct Edge } WorkListItem& memoryToWorkList(uint8_t* data); +unit8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index e49ad44bf1..7d6d707ae6 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -33,6 +33,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), @@ -40,8 +42,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), 
queueSize(params.wlQueueSize) //add this to .py { - wlReadQueue(queueSize); - wlWriteQueue(queueSize); + updateQueue(queueSize); + responseQueue(queueSize); } Port & @@ -135,7 +137,7 @@ WLEngine::getAddrRanges() const } bool WLEngine::handleWLUpdate(PacketPtr pkt){ - auto queue = wlReadQueue; + auto queue = updateQueue; if (queue->blocked()){ queue->sendPktRetry = true; return false; @@ -149,25 +151,32 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ } void WLEngine::processNextWLReadEvent(){ - auto queue = wlReadQueue; + auto queue = updateQueue; memPort = WLMemPort while(!queue.empty()){ //create a map instead of front auto pkt = queue.front() /// conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); + std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + requestOffset[req] = req_offset; if (!memPort->blocked()){ + queue.pop() memPort->sendPacket(memPkt); break; } } + if(!queue.empty() && !nextWLReadEvent.scheduled()){ + schedule(nextWLReadEvent, nextCycle()); + } } bool WLEngine::handleMemResp(PacktPtr pkt) { - auto queue = wlWriteQueue; + auto queue = responseQueue; if (queue->blocked()){ sendPktRetry = true; return false; @@ -183,54 +192,56 @@ WLEngine::handleMemResp(PacktPtr pkt) void WLEngine::processNextWLReduceEvent(){ - auto queue = wlWriteQueue; - auto updateQ = wlReadQueue; + auto queue = responseQueue; + auto updateQ = updateQueue; applyPort = reqPort; - while(!queue.empty()){ - auto update = updateQ.front(); - auto pkt = queue.front(); - uint64_t* updatePtr = pkt->getPtr(); - uint64_t* data = pkt->getPtr(); - uint32_t* value = updatePtr; - uint32_t* temp_prop = prop + 1; - if (*value != *prop){ - //update prop with temp_prop - *temp_prop = min(*value , *temp_prop); - RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); - 
writePkt->setData(data); - if (!memPort->blocked() && !applyPort->blocked()){ - memPort->sendPacket(pkt); - applyPort->sendPacket(pkt); - queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; - } - } - else - break; - } - else{ + auto update = updateQ.front(); + auto value = update->getPtr(); + auto pkt = queue.front(); + uint8_t* data = pkt->getPtr(); + RequestPtr req = pkt->req; + int request_offset = requestOffset[req]; + WorkListItem wl = memoryToWorkList(data + request_offset) + uint32_t temp_prop = wl.temp_prop; + if (temp_prop != *value){ + //update prop with temp_prop + temp_prop = min(value , temp_prop); + if (!memPort->blocked() && !applyPort->blocked()){ + wl.temp_prop = temp_prop; + unit8_t* wlItem = workListToMemory(wl); + memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + memPort->sendPacket(writePkt); + applyPort->sendPacket(writePkt); queue.pop(); if (!queue->blocked() && queue->sendPktRetry){ memPort->trySendRetry(); queue->sendPktRetry = false; } - updateQ.pop() + updateQ.pop(); if (!updateQ->blocked() & updateQ->sendPktRetry){ respPort->trySendRetry(); updateQ->sendPktRetry = false; } - } - + else + break; } + else{ + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } + } + if(!queue && !nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 3f39ec7ee8..7132283463 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -32,6 +32,7 @@ #include #include 
+#include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" #include "mem/port.hh" @@ -39,6 +40,7 @@ #include "params/MPU.hh" #include "sim/clocked_object.hh" + class WLEngine : public ClockedObject { private: @@ -145,10 +147,14 @@ class WLEngine : public ClockedObject void processNextWLReduceEvent(); EventFunctionWrapper nextWLReduceEvent; + System* const system; + const RequestorID requestorId; + std::unordered_map requestOffset; + AddrRangeList getAddrRanges() const; - WLQueue wlReadQueue; - WLQueue wlWriteQueue; + WLQueue updateQueue; + WLQueue responseQueue; WLMemPort memPort; WLMemPort memPort; From 394ffeb71c32901ae564babeadbcd5b6883fb5e5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Feb 2022 21:15:35 -0800 Subject: [PATCH 012/247] Completing PushEngine. --- src/accl/push_engine.cc | 174 ++++++++++++++++++++++++++++++---------- src/accl/push_engine.hh | 24 ++++-- src/accl/util.cc | 43 +++++++++- src/accl/util.hh | 6 +- src/mem/packet.hh | 2 + 5 files changed, 196 insertions(+), 53 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index bc3138f61e..cd5f73eea3 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -26,26 +26,25 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "accl/util.hh" #include "accl/push_engine.hh" - #include "debug/PushEngine.hh" -PushEngine::PushEngine(const PushEngineParams& params): - ClockedObject(params), +PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - vertexQueueSize(params.vertex_queue_size), - vertexQueueLen(0), - updateQueue(params.update_queue_size), - updateQueueLen(0), - nextReceiveEvent([this]{ processNextReceiveEvent(); }, name()), - nextReadEvent([this]{ processNextReadEvent(); }, name()), - nextCreateEvent([this]{ processNextCreateEvent(); }, name()), - nextSendEvent([this]{ processNextSendEvent(); }, name()) -{} + // vertexQueueSize(params.vertex_queue_size), + // vertexQueueLen(0), + // updateQueue(params.update_queue_size), + // updateQueueLen(0), + nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextSendEvent([this] { processNextSendEvent(); }, name()) +{ +} Port & PushEngine::getPort(const std::string &if_name, PortID idx) @@ -61,60 +60,151 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } -bool -PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); } -bool -PushEngine::handleUpdate(PacketPtr pkt) +AddrRangeList +PushEngine::PushRespPort::getAddrRanges() { - if (vertexQueueLen < vertexQueueSize) { - vertexQueue.push(pkt) - vertexQueueLen++; - return true; + owner->memPort->getAddrRanges(); +} - if (!nextReceiveEvent.scheduled()){ - schedule(nextReceiveEvent, nextCycle()); - } +bool PushEngine::handleUpdate(PacketPtr pkt) +{ + // if (vertexQueueLen < vertexQueueSize) { + // vertexQueue.push(pkt) + // vertexQueueLen++; + // if (!nextReceiveEvent.scheduled()) { + // 
schedule(nextReceiveEvent, nextCycle()); + // } + // return true; + // } + // return false; + vertexQueue.push(pkt) + if (!nextReceiveEvent.scheduled()) { + schedule(nextReceiveEvent, nextCycle()); } - return false; + return true; } -void -PushEngine::processNextReceiveEvent() +void PushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.pop(); - uint8_t* data = updatePkt->getData(); - - Addr edgeListAddr = ; // TODO: Generalize finding this address. - int outDegree = ; // TODO: Generalize finding this value. - - Addr reqAddr = (edgeListAddr / 64) * 64; - Addr offsetAddr = edgeListAddr % 64; + uint8_t *data = updatePkt->getData(); + + // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) + uint32_t edge_index = *((uint32_t *)data); + uint32_t degree = *((uint32_t *)(data + 4)); + uint32_t value = *((uint32_t *)(data + 8)); + + std::vector addr_queue; + std::vector offset_queue; + std::vector num_edge_queue; + + for (uint32_t index = 0; index < degree; index++) { + Addr edge_addr = (edge_index + index) * sizeof(Edge); + Addr req_addr = (edge_addr / 64) * 64; + Addr req_offset = edge_addr % 64; + if (addr_queue.size()) { + if (addr_queue.back() == req_addr) { + num_edge_queue.back()++; + } + else { + addr_queue.push(req_addr); + offset_queue.push(req_offset); + num_edge_queue.push(1); + } + } + else { + addr_queue.push(req_addr); + offset_queue.push(req_offset); + num_edge_queue.push(1); + } + } - PacketPtr pkt = getReadPacket(reqAddr, 64, requestorId); + for (int index = 0; index < addr_queue.size(); inedx++) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + memReqQueue.push(pkt); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = value; + } - memPort.sendPacket(pkt); + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + schedule(nextReadEvent, nextCycle()); + } +} +void PushEngine::processNextReadEvent() +{ + PacketPtr pkt = 
memReqQueue.front(); + if (!memPort.blocked()) { + memPort.sendPacket(pkt); + memReqQueue.pop(); + } + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + schedule(nextReadEvent, nextCycle()); + } } -void -PushEngine::processNextReadEvent() +bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) { + return owner->handleMemResp(pkt); +} +void PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + DPRINTF(MemScheduler, "Setting blocked to true on port %s\n", + this->name()); + _blocked = true; + } } -void -PushEngine::processNextCreateEvent() +void PushEngine::handleMemResp(PacketPtr pkt) { + RequestPtr req = pkt->req; + uint8_t *data = pkt->getPtr(); + + Addr offset = reqOffsetMap[req]; + int num_edges = reqNumEdgeMap[req]; + uint32_t value = reqValueMap[req]; + + int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); + for (int i = 0; i < num_edges; i++) { + uint8_t *curr_edge_data = data + offset + i * edge_in_bytes; + Edge e = memoryToEdge(curr_edge_data); + uint32_t *update_data = new uint32_t; + + // TODO: Implement propagate function here + *update_data = value + 1; + PacketPtr update = getUpdatePacket(e.neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); + updateQueue.push(update); + } + if (!nextSendEvent.scheduled() && !updateQueue.empty()) { + schedule(nextSendEvent, nextCycle()); + } } -void -PushEngine::processNextSendEvent() + +void PushEngine::processNextSendEvent() { + PacketPtr pkt = updateQueue.front(); + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + updateQueue.pop(); + } -} \ No newline at end of file + if (!nextSendEvent.scheduled() && !updateQueue.empty()) { + schedule(nextSendEvent, nextCycle()); + } +} diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 6ab902d0e2..a746dcc265 100644 --- 
a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -51,6 +51,7 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + //TODO: Implement this; PushRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); @@ -65,6 +66,7 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + // TODO: Implement this; PushReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); @@ -78,9 +80,12 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + // TODO: Implement this; PushMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - bool sendPacket(PacktPtr pkt); + + void sendPacket(PacktPtr pkt); + bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); } @@ -93,12 +98,18 @@ class PushEngine : public ClockedObject PushMemPort memPort; std::queue vertexQueue; - int vertexQueueSize; - int vertexQueueLen; + // int vertexQueueSize; + // int vertexQueueLen; + + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map reqValueMap; + + std::queue memReqQueue; // Infinite queueing? 
std::queue updateQueue; - int updateQueueSize; - int updateQueueLen; + // int updateQueueSize; + // int updateQueueLen; EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -106,9 +117,6 @@ class PushEngine : public ClockedObject EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - EventFunctionWrapper nextCreateEvent; - void processNextCreateEvent(); - EventFunctionWrapper nextSendEvent; void processNextSendEvent(); diff --git a/src/accl/util.cc b/src/accl/util.cc index 8debd3a937..76ed6269c2 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,6 +28,34 @@ #include "accl/util.hh" + +// Edge: (weight: 64 bits, neighbor: 64 bits) +Edge& +memoryToEdge(uint8_t *data) +{ + uint64_t weight = *((uint64_t*) data); + Addr neighbor = *((Addr*) (data + 8)); // data + 8 because weight: 8 bytes + Edge e = {weight, neighbor}; + return e; +} + +// Edge: (weight: 64 bits, neighbor: 64 bits) +uint8_t* +edgeToMemory(Edge e) +{ + int data_size = (int) ((sizeof(Edge)) / (sizeof(uint8_t))); + + uint8_t* data = new uint8_t [data_size]; + + uint64_t* weightPtr = (uint64_t*) data; + *weightPtr = e.weight; + + Addr* neighborPtr = (Addr*) (data + 8); // data + 8 because weight: 8 bytes + *neighborPtr = e.neighbor; + + return data; +} + PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { @@ -43,6 +71,7 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } + PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, @@ -53,6 +82,18 @@ PacketPtr getWritePacket(Addr addr, req->setPC(((Addr)requestorId) << 2); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + +PacketPtr +getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +{ + RequestPtr req = std::make_shared(addr, size, 0, + requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr)requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + 
pkt->allocate(); pkt->setData(data); @@ -84,4 +125,4 @@ workListToMemory(WorkListItem wl){ *wList + 3 = wl.edgeIndex; return data; -} \ No newline at end of file +} diff --git a/src/accl/util.hh b/src/accl/util.hh index 00ccb7ddd9..c309d4967a 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -39,13 +39,15 @@ struct WorkListItem struct Edge { - uint32_t weight; + uint64_t weight; Addr neighbor; } WorkListItem& memoryToWorkList(uint8_t* data); unit8_t* workListToMemory(WorkListItem wl); + Edge& memoryToEdge(uint8_t* data); +uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); \ No newline at end of file diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -148,6 +148,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; From a13dcdb4c82d5a6d75eede265f42364ddb13f01a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 14 Feb 2022 10:20:19 -0800 Subject: [PATCH 013/247] arch: Accelerator [wip] Adding Sconscript, debugging Change-Id: I0cef6e8745ca8f58a17a01d71dfb090fe1a7e606 --- src/accl/PushEngine.py | 39 ++++++++++++++++++++++ src/accl/SConscript | 36 ++++++++++++++++++++ src/accl/apply.cc | 74 +++++++++++++++++++---------------------- src/accl/apply.hh | 24 +++++++++---- src/accl/push_engine.cc | 2 +- src/accl/util.cc | 2 ++ src/accl/util.hh | 7 ++-- src/accl/wl_engine.cc | 71 +++++++++++++++++++-------------------- src/accl/wl_engine.hh | 20 +++++++---- 9 files changed, 180 insertions(+), 95 deletions(-) create mode 100644 src/accl/PushEngine.py create mode 100644 src/accl/SConscript diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py new file mode 100644 index 0000000000..37639377c1 
--- /dev/null +++ b/src/accl/PushEngine.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class PushEngine(ClockedObject): + type = 'PushEngine' + cxx_header = "accl/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + respPort = ResponsePort("Receives requests from WorkList") + reqPort = RequestPort("Sends requests to Push") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/SConscript b/src/accl/SConscript new file mode 100644 index 0000000000..da0774ca44 --- /dev/null +++ b/src/accl/SConscript @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import('*') + +SimObject('WLEngine.py') +# SimObject('Apply.py') +# SimObject('PushEngine.py') + +# Source('apply.cc') +Source('wl_engine.cc') +# Source('push_engine.cc') diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 6b474d5628..985e6217d7 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -38,11 +38,10 @@ Apply::Apply(const ApplyParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextApplyEvent([this]{processNextApplyEvent; }, name()), - nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), - queueSize(params.applyQueueSize) //add this to .py + nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()) { - applyReadQueue(queueSize); - applyWriteQueue(queueSize); + applyReadQueue(params.applyQueueSize); + applyWriteQueue(params.applyQueueSize); } Port & @@ -110,7 +109,7 @@ Apply::ApplyMemPort::recvReqRetry() } void -WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) +Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -119,7 +118,7 @@ WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyReqtPort::recvReqRetry() +Apply::ApplyReqPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -134,12 +133,13 @@ Apply::getAddrRanges() const bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; - } else - 
queue->push(pkt); - if(!nextApplyCheckEvent.scheduled()){ + } else{ + queue.push(pkt); + } + if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; @@ -147,22 +147,19 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - if(!memPort->blocked()){ + if (!memPort.blocked()){ auto pkt = queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - respPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + respPort.trySendRetry(); + queue.sendPktRetry = false; } // conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; - RequestPtr req = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - requestOffset[req] = req_offset; - memPort->sendPacket(memPkt); - } - else{ - break; + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = req_offset; + memPort.sendPacket(memPkt); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -174,11 +171,11 @@ Apply::handleMemResp(PacktPtr pkt) { auto queue = applyWriteQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; } else - queue->push(writePkt); + queue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); @@ -193,41 +190,38 @@ Apply::processNextApplyEvent(){ auto pkt = queue.front(); uint8_t* data = pkt->getPtr(); - RequestPtr req = pkt->req; - int request_offset = requestOffset[req]; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); uint32_t prop = wl.prop; uint32_t temp_prop = wl.temp_prop; if (temp_prop != prop){ - if (!memPort->blocked() && !reqPort->blocked()){ + if (!memPort.blocked() && 
!reqPort.blocked()){ //update prop with temp_prop - wl.prop = min(prop , temp_prop); + wl.prop = std::min(prop , temp_prop); //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); //Create memory write requests. PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort->sendPacket(writePkt); - applyReqPort->sendPacket(writePkt); + memPort.sendPacket(writePkt); + applyReqPort.sendPacket(writePkt); queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } } - else - break; - } - else{ + }else{ queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } } if(!queue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } -} \ No newline at end of file +} diff --git a/src/accl/apply.hh b/src/accl/apply.hh index b213d37667..f4dabd6a97 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -35,10 +35,12 @@ #include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" -#include "mem/port.hh" +#include "base/types.hh" #include "mem/packet.hh" -#include "params/MPU.hh" +#include "mem/port.hh" +#include "params/Apply.hh" #include "sim/clocked_object.hh" +#include "sim/port.hh" class Apply : public ClockedObject { @@ -46,17 +48,25 @@ class Apply : public ClockedObject struct ApplyQueue{ std::queue applyQueue; - const uint_32 queueSize; + const uint32_t queueSize; bool sendPktRetry; bool blocked(){ - return applyQueue.size() == queueSize; + return (applyQueue.size() == queueSize); } bool empty(){ - return applyQueue.empty(); + return applyQueue->empty(); } void push(PacketPtr pkt){ - 
applyQueue.push(pkt); + applyQueue->push(pkt); + } + + void pop(){ + applyQueue->pop(); + } + + void front(){ + applyQueue->front(); } ApplyQueue(uint32_t qSize): @@ -167,4 +177,4 @@ class Apply : public ClockedObject PortID idx=InvalidPortID) override; }; -#endif // __ACCL_APPLY_HH__ \ No newline at end of file +#endif // __ACCL_APPLY_HH__ diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index cd5f73eea3..c02009d25a 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -28,7 +28,7 @@ #include "accl/util.hh" #include "accl/push_engine.hh" -#include "debug/PushEngine.hh" +// #include "debug/PushEngine.hh" PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), diff --git a/src/accl/util.cc b/src/accl/util.cc index 76ed6269c2..92f6a3e351 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,6 +28,8 @@ #include "accl/util.hh" +#include "base/types.hh" +#include "mem/packet.hh" // Edge: (weight: 64 bits, neighbor: 64 bits) Edge& diff --git a/src/accl/util.hh b/src/accl/util.hh index c309d4967a..737d52e2a1 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "base/addr_range_map.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -35,7 +36,7 @@ struct WorkListItem uint32_t prop; uint32_t degree; uint32_t edgeIndex; -} +}; struct Edge { @@ -44,10 +45,10 @@ struct Edge } WorkListItem& memoryToWorkList(uint8_t* data); -unit8_t* workListToMemory(WorkListItem wl); +uint8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); \ No newline at end of file +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 7d6d707ae6..757bdd2598 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -39,11 +39,10 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextWLReadEvent([this]{processNextWLReadEvent; }, name()), - nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), - queueSize(params.wlQueueSize) //add this to .py + nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) { - updateQueue(queueSize); - responseQueue(queueSize); + updateQueue(params.wlQueueSize); + responseQueue(params.wlQueueSize); } Port & @@ -138,11 +137,11 @@ WLEngine::getAddrRanges() const bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; - if (queue->blocked()){ - queue->sendPktRetry = true; + if (queue.blocked()){ + queue.sendPktRetry = true; return false; } else - queue->push(pkt); + queue.push(pkt); if(!nextWLReadEvent.scheduled()){ schedule(nextWLReadEvent, nextCycle()); @@ -152,19 +151,19 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ void WLEngine::processNextWLReadEvent(){ auto queue = updateQueue; - memPort = WLMemPort - while(!queue.empty()){ //create a map instead of front + auto memPort = WLMemPort; + while (!queue.empty()){ 
//create a map instead of front auto pkt = queue.front() /// conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; - RequestPtr req = + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - requestOffset[req] = req_offset; - if (!memPort->blocked()){ + requestOffset[request] = req_offset; + if (!memPort.blocked()){ queue.pop() - memPort->sendPacket(memPkt); + memPort.sendPacket(memPkt); break; } } @@ -177,11 +176,11 @@ bool WLEngine::handleMemResp(PacktPtr pkt) { auto queue = responseQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; } else - queue->push(writePkt); + queue.push(writePkt); if(!nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); @@ -199,49 +198,47 @@ WLEngine::processNextWLReduceEvent(){ auto value = update->getPtr(); auto pkt = queue.front(); uint8_t* data = pkt->getPtr(); - RequestPtr req = pkt->req; - int request_offset = requestOffset[req]; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset) uint32_t temp_prop = wl.temp_prop; if (temp_prop != *value){ //update prop with temp_prop - temp_prop = min(value , temp_prop); - if (!memPort->blocked() && !applyPort->blocked()){ + temp_prop = std::min(value , temp_prop); + if (!memPort.blocked() && !applyPort.blocked()){ wl.temp_prop = temp_prop; - unit8_t* wlItem = workListToMemory(wl); + uint8_t* wlItem = workListToMemory(wl); memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort->sendPacket(writePkt); - applyPort->sendPacket(writePkt); + memPort.sendPacket(writePkt); + applyPort.sendPacket(writePkt); queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (!queue.blocked() && 
queue.sendPktRetry){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } updateQ.pop(); - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; + if (!updateQ.blocked() & updateQ.sendPktRetry){ + respPort.trySendRetry(); + updateQ.sendPktRetry = false; } } - else - break; } else{ queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (!queue.blocked() && queue.sendPktRetry){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } updateQ.pop() - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; + if (!updateQ.blocked() & updateQ.sendPktRetry){ + respPort.trySendRetry(); + updateQ.sendPktRetry = false; } } - if(!queue && !nextWLReduceEvent.scheduled()){ + if (!queue.empty() && !nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 7132283463..0393cd4cb5 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -37,9 +37,9 @@ #include "base/statistics.hh" #include "mem/port.hh" #include "mem/packet.hh" -#include "params/MPU.hh" +#include "params/WLEngine.hh" #include "sim/clocked_object.hh" - +#include "sim/port.hh" class WLEngine : public ClockedObject { @@ -47,20 +47,26 @@ class WLEngine : public ClockedObject struct WLQueue{ std::queue wlQueue; - const uint_32 queueSize; + const uint32_t queueSize; bool sendPktRetry; bool blocked(){ - return wlQueue.size() == queueSize; + return (wlQueue.size() == queueSize); } bool empty(){ - return wlQueue.empty(); + return wlQueue->empty(); } void push(PacketPtr pkt){ - wlQueue.push(pkt); + wlQueue->push(pkt); + } + void pop(){ + wlQueue->pop(); + } + void front(){ + wlQueue.front()); } - WLReqPort(uint32_t qSize): + WLQueue(uint32_t qSize): queueSize(qSize){} }; From d65b96c0ab6fdd6763a6d940b8bcc8759153930e Mon Sep 17 00:00:00 2001 From: 
Mahyar Samani Date: Mon, 14 Feb 2022 10:02:36 -0800 Subject: [PATCH 014/247] Addin simobject file and startup for PushEngine. --- src/accl/PushEngine.py | 11 ++++++----- src/accl/push_engine.cc | 37 ++++++++++++++++++++++++++++++++++++- src/accl/push_engine.hh | 3 +++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py index 37639377c1..3215fdaee2 100644 --- a/src/accl/PushEngine.py +++ b/src/accl/PushEngine.py @@ -26,14 +26,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class PushEngine(ClockedObject): - type = 'PushEngine' + type = 'WLEngine' cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' - respPort = ResponsePort("Receives requests from WorkList") - reqPort = RequestPort("Sends requests to Push") - memPort = RequestPort("Memory side port, sends requests") + system = Param.System(Parent.any, "The system object this push engine is a part of") + respPort = ResponsePort("Port to Receive updates from outside") + reqPort = RequestPort("Port to send updates to the outside") + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index c02009d25a..f1f8f7698b 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -60,6 +60,40 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::startup() +{ + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr 
pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, requestorId); + memPort.sendFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, requestorId); + memPort.sendFunctional(pkt); + } + +} + bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); @@ -104,7 +138,8 @@ void PushEngine::processNextReceiveEvent() std::vector num_edge_queue; for (uint32_t index = 0; index < degree; index++) { - Addr edge_addr = (edge_index + index) * sizeof(Edge); + // FIXME: For now the base edge address is 1048576 + Addr edge_addr = 1048576 + (edge_index + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index a746dcc265..077c61aa2b 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -39,6 +39,7 @@ #include "sim/clocked_object.hh" #include "sim/system.hh" +//FIXME: Add gem5 namespace here class PushEngine : public ClockedObject { private: @@ -89,6 +90,8 @@ class PushEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); } + virtual void startup() override; + System* const system; const RequestorID requestorId; From fb64f7d3e1c82b7a71b70a14215f8663c8908d65 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 12:26:30 -0800 Subject: [PATCH 015/247] Bug fixes. 
--- src/accl/SConscript | 8 ++--- src/accl/util.cc | 82 +++++++++++++++++++++++++-------------------- src/accl/util.hh | 7 ++-- 3 files changed, 55 insertions(+), 42 deletions(-) diff --git a/src/accl/SConscript b/src/accl/SConscript index da0774ca44..4b78ff9e80 100644 --- a/src/accl/SConscript +++ b/src/accl/SConscript @@ -28,9 +28,9 @@ Import('*') SimObject('WLEngine.py') -# SimObject('Apply.py') -# SimObject('PushEngine.py') +SimObject('Apply.py') +SimObject('PushEngine.py') -# Source('apply.cc') +Source('apply.cc') Source('wl_engine.cc') -# Source('push_engine.cc') +Source('push_engine.cc') diff --git a/src/accl/util.cc b/src/accl/util.cc index 92f6a3e351..b81ba4db7d 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,8 +28,39 @@ #include "accl/util.hh" -#include "base/types.hh" -#include "mem/packet.hh" +WorkListItem& +memoryToWorkList(uint8_t* data){ + WorkListItem wl; + uint32_t temp_prop = *((uint32_t*) data)); + + uint32_t prop = *((uint32_t*) (data + 4)); + + uint32_t degree = *((uint32_t*) (data + 8)); + + uint32_t addr = *((uint32_t*) (data + 12)); + + retrun wl = {temp_prop, prop, degree, addr}; +} + +uint8_t* +workListToMemory(WorkListItem wl){ + int data_size = sizeof(WorkListItem) / sizeof(uint8_t); + uint8_t* data = new uint8_t [data_size]; + + uint32_t* tempPtr = (uint32_t*) data; + *tempPtr = wl.temp_prop; + + uint32_t* propPtr = (uint32_t*) (data + 4); + *propPtr = wl.prop; + + uint32_t* degreePtr = (uint32_t*) (data + 8); + *degreePtr = wl.degree; + + uint32_t* edgePtr = (uint32_t*) (data + 12); + *edgePtr = wl.edgeIndex; + + return data; +} // Edge: (weight: 64 bits, neighbor: 64 bits) Edge& @@ -58,7 +89,7 @@ edgeToMemory(Edge e) return data; } -PacketPtr +PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -73,19 +104,24 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } - -PacketPtr getWritePacket(Addr 
addr, - unsigned int size, - uint8_t* data, - RequestorID requestorId) +PacketPtr& +getWritePacket(Addr addr, unsigned int size, + uint8_t* data, RequestorID requestorId) { - equestPtr req = std::make_shared(addr, size, 0, + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits req->setPC(((Addr)requestorId) << 2); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} -PacketPtr +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) { RequestPtr req = std::make_shared(addr, size, 0, @@ -102,29 +138,3 @@ getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) return pkt; } -WorkListItem& -memoryToWorkList(uint8_t* data){ - WorkListItem wl; - uint32_t temp_prop = *((uint32_t*) data)); - - uint32_t prop = *((uint32_t*) (data + 4)); - - uint32_t degree = *((uint32_t*) (data + 8)); - - uint32_t addr = *((uint32_t*) (data + 12)); - - retrun wl = {temp_prop, prop, degree, addr}; -} - -unit8_t* -workListToMemory(WorkListItem wl){ - int data_size = sizeof(WorkListItem)/sizeof(uint_8) - uint_8* data = new uint8_t [data_size]; - uint_32* wList = (uint_32*)data; - *wList = wl.prop; - *wList + 1 = wl.temp_prop; - *wList + 2 = wl.degree; - *wList + 3 = wl.edgeIndex; - - return data; -} diff --git a/src/accl/util.hh b/src/accl/util.hh index 737d52e2a1..da5a0736c9 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,7 +26,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "base/addr_range_map.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -50,5 +49,9 @@ uint8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); -PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); +PacketPtr& getReadPacket(Addr addr, unsigned int size, + RequestorID requestorId); +PacketPtr& +getWritePacket(Addr addr, unsigned int size, + uint8_t* data, RequestorID requestorId); PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); From 9eeb01889c5813d1f60ddfacda5e4c4538460860 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 13:05:49 -0800 Subject: [PATCH 016/247] More bug fixes. --- src/accl/SConscript | 5 +++-- src/accl/apply.cc | 5 +++++ src/accl/apply.hh | 5 +++++ src/accl/push_engine.cc | 6 +++++- src/accl/push_engine.hh | 5 ++++- src/accl/util.hh | 3 +-- src/accl/wl_engine.cc | 4 ++++ src/accl/wl_engine.hh | 11 ++++++++--- 8 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/accl/SConscript b/src/accl/SConscript index 4b78ff9e80..18ac71eb7d 100644 --- a/src/accl/SConscript +++ b/src/accl/SConscript @@ -27,10 +27,11 @@ Import('*') -SimObject('WLEngine.py') SimObject('Apply.py') SimObject('PushEngine.py') +SimObject('WLEngine.py') Source('apply.cc') -Source('wl_engine.cc') Source('push_engine.cc') +Source('wl_engine.cc') +Source('util.cc') diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 985e6217d7..678f240bf6 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -30,6 +30,9 @@ #include +namespace gem5 +{ + Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), system(params.system), @@ -225,3 +228,5 @@ Apply::processNextApplyEvent(){ schedule(nextApplyEvent, nextCycle()); } } + +} diff --git a/src/accl/apply.hh b/src/accl/apply.hh index f4dabd6a97..42cb310136 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -42,6 +42,9 @@ #include "sim/clocked_object.hh" #include "sim/port.hh" 
+namespace gem5 +{ + class Apply : public ClockedObject { private: @@ -177,4 +180,6 @@ class Apply : public ClockedObject PortID idx=InvalidPortID) override; }; +} + #endif // __ACCL_APPLY_HH__ diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index f1f8f7698b..57fa560ff7 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -28,7 +28,9 @@ #include "accl/util.hh" #include "accl/push_engine.hh" -// #include "debug/PushEngine.hh" + +namespace gem5 +{ PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), @@ -243,3 +245,5 @@ void PushEngine::processNextSendEvent() schedule(nextSendEvent, nextCycle()); } } + +} diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 077c61aa2b..cc129076a5 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -39,7 +39,9 @@ #include "sim/clocked_object.hh" #include "sim/system.hh" -//FIXME: Add gem5 namespace here +namespace gem5 +{ + class PushEngine : public ClockedObject { private: @@ -134,4 +136,5 @@ class PushEngine : public ClockedObject }; +} #endif // __ACCL_PUSH_ENGINE_HH__ diff --git a/src/accl/util.hh b/src/accl/util.hh index da5a0736c9..76d67ce6df 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -51,7 +51,6 @@ uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& -getWritePacket(Addr addr, unsigned int size, +PacketPtr& getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 757bdd2598..00371e56cc 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -30,6 +30,8 @@ #include +namespace gem5 +{ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), @@ -242,3 +244,5 @@ WLEngine::processNextWLReduceEvent(){ schedule(nextWLReduceEvent, nextCycle()); } } + +} diff 
--git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 0393cd4cb5..8c69bba7f7 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -41,6 +41,9 @@ #include "sim/clocked_object.hh" #include "sim/port.hh" +namespace gem5 +{ + class WLEngine : public ClockedObject { private: @@ -117,7 +120,7 @@ class WLEngine : public ClockedObject public: WLMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - void sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); void trySendRetry(); bool blocked(){ return _blocked; @@ -132,7 +135,7 @@ class WLEngine : public ClockedObject bool sendPacket(); //one queue for write and one for read a priotizes write over read void readWLBuffer(); - bool handleMemResp(PacktPtr resp); + bool handleMemResp(PacketPtr resp); //Events @@ -174,4 +177,6 @@ class WLEngine : public ClockedObject PortID idx=InvalidPortID) override; }; -#endif // __ACCL_WLE_HH__ \ No newline at end of file +} + +#endif // __ACCL_WLE_HH__ From 6efe411a7a16cca5b80ce4fdecba591c1f9de67a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 14:18:14 -0800 Subject: [PATCH 017/247] Even more bug fixes. --- src/accl/push_engine.cc | 28 +++++++++++++++++++++++----- src/accl/push_engine.hh | 35 +++++++++++++++++++++-------------- src/accl/util.cc | 24 ++++++++++++++---------- src/accl/util.hh | 18 ++++++++++++------ 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 57fa560ff7..56a57e76ac 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -65,6 +65,8 @@ PushEngine::getPort(const std::string &if_name, PortID idx) void PushEngine::startup() { + //FIXME: This is the current version of our initializer. + // This should be updated in the future. 
WorkListItem vertices [5] = { {0, 0, 3, 0}, // Addr: 0 {0, 0, 1, 3}, // Addr: 16 @@ -109,6 +111,7 @@ PushEngine::PushRespPort::getAddrRanges() bool PushEngine::handleUpdate(PacketPtr pkt) { + //FIXME: There should be a check if the queues are full. // if (vertexQueueLen < vertexQueueSize) { // vertexQueue.push(pkt) // vertexQueueLen++; @@ -192,20 +195,19 @@ bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -void PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +void +PushEngine::PushMemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { blockedPacket = pkt; - DPRINTF(MemScheduler, "Setting blocked to true on port %s\n", - this->name()); _blocked = true; } } -void PushEngine::handleMemResp(PacketPtr pkt) +bool PushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -230,8 +232,12 @@ void PushEngine::handleMemResp(PacketPtr pkt) if (!nextSendEvent.scheduled() && !updateQueue.empty()) { schedule(nextSendEvent, nextCycle()); } -} + //TODO: Should we always return true? It's the response from the memory + // so maybe yes. We assume the receiving bandwidth of the PushEngine is + // higher than its demand bandwidth + return true; +} void PushEngine::processNextSendEvent() { @@ -246,4 +252,16 @@ void PushEngine::processNextSendEvent() } } +void +PushEngine::PushReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index cc129076a5..7b5f483431 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -54,10 +54,10 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: - //TODO: Implement this; - PushRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + PushRespPort(const std::string& name, PushEngine* owner): + ResponsePort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); } @@ -65,27 +65,32 @@ class PushEngine : public ClockedObject class PushReqPort : public RequestPort { private: + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - // TODO: Implement this; - PushReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + PushReqPort(const std::string& name, PushEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); } class PushMemPort : public RequestPort { private: + PushEngine* owner bool _blocked; PacketPtr blockedPacket; public: - // TODO: Implement this; - PushMemPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + PushMemPort(const std::string& name, PushEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacktPtr pkt); bool blocked() { return _blocked; } @@ -106,9 +111,9 @@ class PushEngine : public ClockedObject // int vertexQueueSize; // int vertexQueueLen; - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map 
reqValueMap; std::queue memReqQueue; // Infinite queueing? @@ -127,6 +132,8 @@ class PushEngine : public ClockedObject bool handleUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr pkt); + public: PushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/util.cc b/src/accl/util.cc index b81ba4db7d..40a1fc761b 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,18 +28,20 @@ #include "accl/util.hh" -WorkListItem& +namespace gem5 +{ + +WorkListItem memoryToWorkList(uint8_t* data){ WorkListItem wl; - uint32_t temp_prop = *((uint32_t*) data)); + uint32_t temp_prop = *((uint32_t*) data); uint32_t prop = *((uint32_t*) (data + 4)); - uint32_t degree = *((uint32_t*) (data + 8)); - uint32_t addr = *((uint32_t*) (data + 12)); - retrun wl = {temp_prop, prop, degree, addr}; + wl = {temp_prop, prop, degree, addr}; + return wl; } uint8_t* @@ -63,7 +65,7 @@ workListToMemory(WorkListItem wl){ } // Edge: (weight: 64 bits, neighbor: 64 bits) -Edge& +Edge memoryToEdge(uint8_t *data) { uint64_t weight = *((uint64_t*) data); @@ -89,7 +91,7 @@ edgeToMemory(Edge e) return data; } -PacketPtr& +PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -104,7 +106,7 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } -PacketPtr& +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId) { @@ -121,8 +123,9 @@ getWritePacket(Addr addr, unsigned int size, return pkt; } -PacketPtr& -getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +PacketPtr +getUpdatePacket(Addr addr, unsigned int size, + uint8_t *data, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -138,3 +141,4 @@ getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) return pkt; } +} diff --git a/src/accl/util.hh b/src/accl/util.hh index 76d67ce6df..91692488a4 100644 --- a/src/accl/util.hh +++ 
b/src/accl/util.hh @@ -28,6 +28,10 @@ #include "base/types.hh" #include "mem/packet.hh" +#include "mem/request.hh" + +namespace gem5 +{ struct WorkListItem { @@ -41,16 +45,18 @@ struct Edge { uint64_t weight; Addr neighbor; -} +}; -WorkListItem& memoryToWorkList(uint8_t* data); +WorkListItem memoryToWorkList(uint8_t* data); uint8_t* workListToMemory(WorkListItem wl); -Edge& memoryToEdge(uint8_t* data); +Edge memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); -PacketPtr& getReadPacket(Addr addr, unsigned int size, +PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& getWritePacket(Addr addr, unsigned int size, +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); -PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); +PacketPtr getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + +} From fcdcceb33d9d2dc054f8ad021c0e39c8e4bff21e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 15:46:21 -0800 Subject: [PATCH 018/247] Bug fixes, bug fixes everywhere. 
--- src/accl/apply.cc | 12 ++++---- src/accl/apply.hh | 61 ++++++++++++++++++++--------------------- src/accl/push_engine.cc | 8 +++++- src/accl/push_engine.hh | 17 ++++++------ src/accl/wl_engine.hh | 17 +++++------- 5 files changed, 60 insertions(+), 55 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 678f240bf6..c44738d3fa 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -62,14 +62,14 @@ Apply::getPort(const std::string &if_name, PortID idx) } AddrRangeList -Apply::ApplyRespPort::getAddrRanges() const +Apply::ApplyRespPort::getAddrRanges() { return owner->getAddrRanges(); } bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) { - if (!this->handleWL(pkt)){ + if (!owner->handleWL(pkt)){ return false; } return true; @@ -82,15 +82,17 @@ Apply::ApplyRespPort::trySendRetry() } -virtual bool +bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { - return this->handleMemResp(pkt); + return owner->handleMemResp(pkt); } void -WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) +Apply::ApplyMemPort::sendPacket(PacketPtr pkt) { + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 42cb310136..788550646a 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -33,14 +33,13 @@ #include #include "accl/util.hh" -#include "base/addr_range_map.hh" -#include "base/statistics.hh" -#include "base/types.hh" +#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/Apply.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" +#include "sim/system.hh" namespace gem5 { @@ -58,18 +57,18 @@ class Apply : public ClockedObject return (applyQueue.size() == queueSize); } bool empty(){ - return applyQueue->empty(); + return applyQueue.empty(); } void push(PacketPtr pkt){ - applyQueue->push(pkt); + applyQueue.push(pkt); } void pop(){ - applyQueue->pop(); + applyQueue.pop(); } 
void front(){ - applyQueue->front(); + applyQueue.front(); } ApplyQueue(uint32_t qSize): @@ -80,16 +79,17 @@ class Apply : public ClockedObject { private: Apply *owner; + bool _blocked; PacketPtr blockedPacket; public: - ApplyRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyRespPort(const std::string& name, Apply* owner): + ResponsePort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} - virtual AddrRangeList getAddrRanges(); void trySendRetry(); - - protected: + virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); }; @@ -101,12 +101,13 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: - ApplyReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyReqPort(const std::string& name, Apply* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); - bool blocked(){ - return _blocked; - } + bool blocked() { return _blocked; } protected: void recvReqRetry() override; @@ -121,13 +122,14 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: - ApplyReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyMemPort(const std::string& name, Apply* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); void trySendRetry(); - bool blocked(){ - return _blocked; - } + bool blocked(){ return _blocked;} protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -138,28 +140,24 @@ class Apply : public ClockedObject bool sendPacket(); //one queue for write and one for read a priotizes write over read void readApplyBuffer(); - bool handleMemResp(PacktPtr resp); + bool handleMemResp(PacketPtr resp); void writePushBuffer(); //Events void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyCheckEvent; /* Syncronously checked If there are any 
active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ void processNextApplyEvent(); + EventFunctionWrapper nextApplyEvent; /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - void processNextApplyEvent(); - EventFunctionWrapper nextApplyEvent; - - void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyCheckEvent; - System* const system; const RequestorID requestorId; @@ -170,13 +168,14 @@ class Apply : public ClockedObject ApplyMemPort memPort; ApplyRespPort respPort; - ApplyRequestPort reqPort; + ApplyReqPort reqPort; std::unordered_map requestOffset; public: Apply(const ApplyParams &apply); - Port &getPort(const std::string &if_name, + + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 56a57e76ac..48f1115042 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -106,7 +106,7 @@ bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) AddrRangeList PushEngine::PushRespPort::getAddrRanges() { - owner->memPort->getAddrRanges(); + owner->getAddrRanges(); } bool PushEngine::handleUpdate(PacketPtr pkt) @@ -264,4 +264,10 @@ PushEngine::PushReqPort::sendPacket(PacketPtr pkt) } } +AddrRangeList +PushEngine::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 7b5f483431..d478d14df0 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -31,8 +31,7 @@ #include -#include "base/addr_range_map.hh" -#include "base/statistics.hh" +#include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/PushEngine.hh" @@ -60,7 +59,7 @@ class PushEngine : public ClockedObject {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); - } + }; class PushReqPort : public 
RequestPort { @@ -77,12 +76,12 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); - } + }; class PushMemPort : public RequestPort { private: - PushEngine* owner + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; @@ -92,10 +91,10 @@ class PushEngine : public ClockedObject _blocked(false), blockedPacket(nullptr) {} - void sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); - } + }; virtual void startup() override; @@ -134,11 +133,13 @@ class PushEngine : public ClockedObject bool handleMemResp(PacketPtr pkt); + AddrRangeList getAddrRanges(); + public: PushEngine(const PushEngineParams ¶ms); - Port &getPort(const std::string &if_name, + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 8c69bba7f7..6f875adfed 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -33,13 +33,13 @@ #include #include "accl/util.hh" -#include "base/addr_range_map.hh" -#include "base/statistics.hh" +#include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/WLEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" +#include "sim/system.hh" namespace gem5 { @@ -140,40 +140,37 @@ class WLEngine : public ClockedObject //Events void processNextWLReadEvent(); + EventFunctionWrapper nextWLReadEvent; /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ void processNextWLReduceEvent(); + EventFunctionWrapper nextWLReduceEvent; /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - void processNextWLReadEvent(); - EventFunctionWrapper nextWLReadEvent; - - void 
processNextWLReduceEvent(); - EventFunctionWrapper nextWLReduceEvent; System* const system; const RequestorID requestorId; + std::unordered_map requestOffset; AddrRangeList getAddrRanges() const; WLQueue updateQueue; WLQueue responseQueue; - WLMemPort memPort; WLMemPort memPort; WLRespPort respPort; - WLRequestPort reqPort; + WLReqPort reqPort; public: WLEngine(const WLEngineParams ¶ms); - Port &getPort(const std::string &if_name, + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; From 750510f593e59e776bbfb2906a8b8e082669aa36 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 14 Feb 2022 21:33:53 -0800 Subject: [PATCH 019/247] arch: Debugging worklist engine [wip] Adding some missing virtual functions. Change-Id: I26f6c7d789f4b295bac3bc9b2a80f2cadb45b96f --- src/accl/wl_engine.cc | 26 +++++++++++++++++++++++++- src/accl/wl_engine.hh | 4 ++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 00371e56cc..7515e10167 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -81,6 +81,24 @@ WLEngine::WLRespPort::trySendRetry() sendRetryReq(); } +virtual void +WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +virtual Tick +WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +virtual void +WLEngine::WLRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + void WLEngine::WLMemPort::sendPacket(PacketPtr pkt) { @@ -137,6 +155,12 @@ WLEngine::getAddrRanges() const return memPort.getAddrRanges(); } +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + memPort.recvFunctional(pkt); +} + bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ @@ -164,7 +188,7 @@ void WLEngine::processNextWLReadEvent(){ PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); requestOffset[request] = req_offset; if (!memPort.blocked()){ - 
queue.pop() + queue.pop(); memPort.sendPacket(memPkt); break; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 6f875adfed..d2b96db203 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -88,6 +88,9 @@ class WLEngine : public ClockedObject protected: virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class WLReqPort : public RequestPort //To Apply Engine @@ -159,6 +162,7 @@ class WLEngine : public ClockedObject std::unordered_map requestOffset; AddrRangeList getAddrRanges() const; + void recvFunctional(PacketPtr pkt); WLQueue updateQueue; WLQueue responseQueue; From 79429d177df5baef0d3cd4fc33a4db249d66db37 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:38:05 -0800 Subject: [PATCH 020/247] Bug fix. --- src/accl/Apply.py | 1 + src/accl/apply.cc | 6 ++--- src/accl/push_engine.cc | 50 ++++++++++++++++++++++++++++++++--------- src/accl/push_engine.hh | 3 +++ 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index 01c627d4c8..58639e880a 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -34,6 +34,7 @@ class Apply(ClockedObject): cxx_header = "accl/apply.hh" cxx_class = 'gem5::Apply' + system = Param.System(Parent.any, "The system object this apply engine is a part of") respPort = ResponsePort("Receives requests from WorkList") reqPort = RequestPort("Sends requests to Push") memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index c44738d3fa..70bc8031c9 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -40,8 +40,8 @@ Apply::Apply(const ApplyParams ¶ms): reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - nextApplyEvent([this]{processNextApplyEvent; }, name()), - nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, 
name()) + nextApplyEvent([this]{ processNextApplyEvent(); }, name()), + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) { applyReadQueue(params.applyQueueSize); applyWriteQueue(params.applyQueueSize); @@ -172,7 +172,7 @@ void Apply::processNextApplyCheckEvent(){ } bool -Apply::handleMemResp(PacktPtr pkt) +Apply::handleMemResp(PacketPtr pkt) { auto queue = applyWriteQueue; diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 48f1115042..6ebe34ebd3 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -106,7 +106,7 @@ bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) AddrRangeList PushEngine::PushRespPort::getAddrRanges() { - owner->getAddrRanges(); + return owner->getAddrRanges(); } bool PushEngine::handleUpdate(PacketPtr pkt) @@ -121,7 +121,7 @@ bool PushEngine::handleUpdate(PacketPtr pkt) // return true; // } // return false; - vertexQueue.push(pkt) + vertexQueue.push(pkt); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); } @@ -130,8 +130,8 @@ bool PushEngine::handleUpdate(PacketPtr pkt) void PushEngine::processNextReceiveEvent() { - PacketPtr updatePkt = vertexQueue.pop(); - uint8_t *data = updatePkt->getData(); + PacketPtr updatePkt = vertexQueue.front(); + uint8_t *data = updatePkt->getPtr(); // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) uint32_t edge_index = *((uint32_t *)data); @@ -152,19 +152,19 @@ void PushEngine::processNextReceiveEvent() num_edge_queue.back()++; } else { - addr_queue.push(req_addr); - offset_queue.push(req_offset); - num_edge_queue.push(1); + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); } } else { - addr_queue.push(req_addr); - offset_queue.push(req_offset); - num_edge_queue.push(1); + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); } } - for (int index = 0; index < addr_queue.size(); inedx++) { + for (int index = 0; index 
< addr_queue.size(); index++) { PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); memReqQueue.push(pkt); reqOffsetMap[pkt->req] = offset_queue[index]; @@ -172,6 +172,8 @@ void PushEngine::processNextReceiveEvent() reqValueMap[pkt->req] = value; } + vertexQueue.pop(); + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { schedule(nextReadEvent, nextCycle()); } @@ -264,10 +266,36 @@ PushEngine::PushReqPort::sendPacket(PacketPtr pkt) } } +void +PushEngine::PushReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + AddrRangeList PushEngine::getAddrRanges() { return memPort.getAddrRanges(); } +void +PushEngine::PushMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index d478d14df0..0acedd0da8 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -59,6 +59,7 @@ class PushEngine : public ClockedObject {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + }; class PushReqPort : public RequestPort @@ -76,6 +77,7 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); }; class PushMemPort : public RequestPort @@ -94,6 +96,7 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); }; virtual void startup() override; From 228fcf05f87be11a23ee5cfb8dec41d5b8dbcedd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:46:27 -0800 Subject: [PATCH 
021/247] Bug fix. --- src/accl/Apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index 58639e880a..d6a4bbe5a9 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class Apply(ClockedObject): From 709a21552623e2f112730512a1652d0436ccce03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:47:36 -0800 Subject: [PATCH 022/247] Fixing a bug-fix. --- src/accl/apply.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 788550646a..e1b6d33359 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -88,6 +88,7 @@ class Apply : public ClockedObject _blocked(false), blockedPacket(nullptr) {} + protected: void trySendRetry(); virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); From c1dd68a3e06a498b89cbb043f4779865ecad91b3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 15 Feb 2022 00:13:21 -0800 Subject: [PATCH 023/247] fixing some bugs --- src/accl/Apply.py | 1 + src/accl/WLEngine.py | 4 +++- src/accl/apply.cc | 31 ++++++++++++++++++++---- src/accl/apply.hh | 23 ++++++++++-------- src/accl/wl_engine.cc | 48 +++++++++++++++++++------------------ src/accl/wl_engine.hh | 55 ++++++++++++++++++++++++------------------- 6 files changed, 99 insertions(+), 63 deletions(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index d6a4bbe5a9..8720287cc8 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -38,3 +38,4 @@ class Apply(ClockedObject): respPort = ResponsePort("Receives requests from WorkList") reqPort = RequestPort("Sends requests to Push") memPort = RequestPort("Memory side port, sends requests") + applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git 
a/src/accl/WLEngine.py b/src/accl/WLEngine.py index fe6b25b6ba..562fd04423 100644 --- a/src/accl/WLEngine.py +++ b/src/accl/WLEngine.py @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class WLEngine(ClockedObject): @@ -34,6 +34,8 @@ class WLEngine(ClockedObject): cxx_header = "accl/wl_engine.hh" cxx_class = 'gem5::WLEngine' + system = Param.System(Parent.any, "The system object this push WorkList is a part of") respPort = ResponsePort("Receives updates") reqPort = RequestPort("Sends requests to Apply") memPort = RequestPort("Memory side port, sends requests") + wlQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 70bc8031c9..410eff5268 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -81,6 +81,23 @@ Apply::ApplyRespPort::trySendRetry() sendRetryReq(); } +void +Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) +{ + panic("Not implemented"); +} + +Tick +Apply::ApplyRespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +Apply::ApplyRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) @@ -139,7 +156,7 @@ Apply::getAddrRanges() const bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; } else{ queue.push(pkt); @@ -177,7 +194,7 @@ Apply::handleMemResp(PacketPtr pkt) auto queue = applyWriteQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; } else queue.push(pkt); @@ -192,7 +209,7 @@ Apply::handleMemResp(PacketPtr pkt) void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - auto pkt = queue.front(); + PacketPtr pkt = queue.front(); uint8_t* data = 
pkt->getPtr(); RequestPtr request = pkt->req; @@ -204,7 +221,11 @@ Apply::processNextApplyEvent(){ if (temp_prop != prop){ if (!memPort.blocked() && !reqPort.blocked()){ //update prop with temp_prop - wl.prop = std::min(prop , temp_prop); + if(prop < temp_prop){ + wl.prop = prop; + }else{ + wl.prop = temp_prop; + } //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); @@ -212,7 +233,7 @@ Apply::processNextApplyEvent(){ PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); memPort.sendPacket(writePkt); - applyReqPort.sendPacket(writePkt); + reqPort.sendPacket(writePkt); queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ memPort.trySendRetry(); diff --git a/src/accl/apply.hh b/src/accl/apply.hh index e1b6d33359..f08c1fef85 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -63,12 +63,12 @@ class Apply : public ClockedObject applyQueue.push(pkt); } - void pop(){ - applyQueue.pop(); + PacketPtr pop(){ + return applyQueue->pop(); } - void front(){ - applyQueue.front(); + PacketPtr front(){ + return applyQueue.front(); } ApplyQueue(uint32_t qSize): @@ -83,15 +83,18 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: + void trySendRetry(); + virtual AddrRangeList getAddrRanges(); ApplyRespPort(const std::string& name, Apply* owner): ResponsePort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} protected: - void trySendRetry(); - virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class ApplyReqPort : public RequestPort @@ -137,6 +140,10 @@ class Apply : public ClockedObject void recvReqRetry() override; }; + ApplyMemPort memPort; + ApplyRespPort respPort; + ApplyReqPort reqPort; + bool handleWL(PacketPtr pkt); bool sendPacket(); //one queue for write and one 
for read a priotizes write over read @@ -167,10 +174,6 @@ class Apply : public ClockedObject ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; - ApplyMemPort memPort; - ApplyRespPort respPort; - ApplyReqPort reqPort; - std::unordered_map requestOffset; public: diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 7515e10167..9b16a15575 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -36,6 +36,7 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), system(params.system), + queueSize(params.wlQueueSize), requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), @@ -43,8 +44,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): nextWLReadEvent([this]{processNextWLReadEvent; }, name()), nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) { - updateQueue(params.wlQueueSize); - responseQueue(params.wlQueueSize); + updateQueue.resize(queueSize); + responseQueue.resize(queueSize); } Port & @@ -69,7 +70,7 @@ WLEngine::WLRespPort::getAddrRanges() const bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) { - if (!this->handleWLUpdate(pkt)){ + if (!owner->handleWLUpdate(pkt)){ return false; } return true; @@ -81,19 +82,19 @@ WLEngine::WLRespPort::trySendRetry() sendRetryReq(); } -virtual void +void WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) { owner->recvFunctional(pkt); } -virtual Tick +Tick WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) { panic("recvAtomic unimpl."); } -virtual void +void WLEngine::WLRespPort::recvRespRetry() { panic("recvRespRetry from response port is called."); @@ -118,10 +119,10 @@ WLEngine::WLMemPort::recvReqRetry() blockedPacket = nullptr; } -virtual bool +bool WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { - return this->handleMemResp(pkt); + return owner->handleMemResp(pkt); } void @@ -177,15 +178,14 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ void WLEngine::processNextWLReadEvent(){ auto queue = 
updateQueue; - auto memPort = WLMemPort; while (!queue.empty()){ //create a map instead of front - auto pkt = queue.front() + PacketPtr pkt = queue.front(); /// conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; if (!memPort.blocked()){ queue.pop(); @@ -199,15 +199,15 @@ void WLEngine::processNextWLReadEvent(){ } bool -WLEngine::handleMemResp(PacktPtr pkt) +WLEngine::handleMemResp(PacketPtr pkt) { auto queue = responseQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; - } else - queue.push(writePkt); - + } else{ + queue.push(pkt); + } if(!nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } @@ -219,18 +219,20 @@ void WLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; - applyPort = reqPort; - auto update = updateQ.front(); - auto value = update->getPtr(); - auto pkt = queue.front(); + auto applyPort = reqPort; + PacketPtr update = updateQ.front(); + uint8_t* value = update->getPtr(); + PacketPtr pkt = queue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset) + WorkListItem wl = memoryToWorkList(data + request_offset); uint32_t temp_prop = wl.temp_prop; if (temp_prop != *value){ //update prop with temp_prop - temp_prop = std::min(value , temp_prop); + if(*value < temp_prop){ + temp_prop = *value; + } if (!memPort.blocked() && !applyPort.blocked()){ wl.temp_prop = temp_prop; uint8_t* wlItem = workListToMemory(wl); @@ -257,7 +259,7 @@ WLEngine::processNextWLReduceEvent(){ memPort.trySendRetry(); queue.sendPktRetry = false; } - updateQ.pop() + updateQ.pop(); if (!updateQ.blocked() & 
updateQ.sendPktRetry){ respPort.trySendRetry(); updateQ.sendPktRetry = false; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index d2b96db203..8d02c16981 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -50,27 +50,32 @@ class WLEngine : public ClockedObject struct WLQueue{ std::queue wlQueue; - const uint32_t queueSize; + uint32_t queueSize; bool sendPktRetry; + void resize(uint32_t size){ + queueSize = size; + } + bool blocked(){ return (wlQueue.size() == queueSize); } bool empty(){ - return wlQueue->empty(); + return wlQueue.empty(); } void push(PacketPtr pkt){ - wlQueue->push(pkt); + wlQueue.push(pkt); } void pop(){ - wlQueue->pop(); + wlQueue.pop(); } - void front(){ - wlQueue.front()); + PacketPtr front(){ + return wlQueue.front(); } WLQueue(uint32_t qSize): - queueSize(qSize){} + queueSize(qSize), + sendPktRetry(false){} }; class WLRespPort : public ResponsePort //From Push engine @@ -83,7 +88,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const override; void trySendRetry(); protected: @@ -129,50 +134,52 @@ class WLEngine : public ClockedObject return _blocked; } - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; + protected: + virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; }; + System* const system; + const uint32_t queueSize; + const RequestorID requestorId; + + WLReqPort reqPort; + WLRespPort respPort; + WLMemPort memPort; + bool handleWLU(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read void readWLBuffer(); - bool handleMemResp(PacketPtr resp); //Events - void processNextWLReadEvent(); EventFunctionWrapper nextWLReadEvent; + void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + 
MPU::MPU::MemPortsendTimingReq */ - void processNextWLReduceEvent(); EventFunctionWrapper nextWLReduceEvent; + void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - System* const system; - const RequestorID requestorId; - std::unordered_map requestOffset; - AddrRangeList getAddrRanges() const; - void recvFunctional(PacketPtr pkt); - WLQueue updateQueue; WLQueue responseQueue; - WLMemPort memPort; - WLRespPort respPort; - WLReqPort reqPort; - public: + public: + AddrRangeList getAddrRanges() const; + bool handleWLUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr resp); + void recvFunctional(PacketPtr pkt); WLEngine(const WLEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; From 90800d55dd30af7e3fb47173bad39c3adf11ccbd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:24:55 -0800 Subject: [PATCH 024/247] Bug fix. 
--- src/accl/push_engine.cc | 26 ++++++++++++++++++++------ src/accl/push_engine.hh | 13 ++++++++++++- src/accl/wl_engine.cc | 9 ++------- src/accl/wl_engine.hh | 3 +-- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 6ebe34ebd3..746ed8a142 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -98,18 +98,32 @@ PushEngine::startup() } -bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +AddrRangeList +PushEngine::PushRespPort::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +bool +PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); } -AddrRangeList -PushEngine::PushRespPort::getAddrRanges() +Tick +PushEngine::PushRespPort::recvAtomic(PacketPtr pkt) { - return owner->getAddrRanges(); + panic("recvAtomic unimpl."); +} + +void +PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); } -bool PushEngine::handleUpdate(PacketPtr pkt) +bool +PushEngine::handleUpdate(PacketPtr pkt) { //FIXME: There should be a check if the queues are full. 
// if (vertexQueueLen < vertexQueueSize) { @@ -131,7 +145,7 @@ bool PushEngine::handleUpdate(PacketPtr pkt) void PushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.front(); - uint8_t *data = updatePkt->getPtr(); + uint8_t* data = updatePkt->getPtr(); // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) uint32_t edge_index = *((uint32_t *)data); diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 0acedd0da8..1aa70c7acb 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -58,8 +58,12 @@ class PushEngine : public ClockedObject _blocked(false), blockedPacket(nullptr) {} virtual AddrRangeList getAddrRanges(); - virtual bool recvTimingReq(PacketPtr pkt); + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class PushReqPort : public RequestPort @@ -76,6 +80,8 @@ class PushEngine : public ClockedObject {} void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } + + protected: virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; @@ -95,6 +101,8 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } + + protected: virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; @@ -138,6 +146,8 @@ class PushEngine : public ClockedObject AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + public: PushEngine(const PushEngineParams ¶ms); @@ -148,4 +158,5 @@ class PushEngine : public ClockedObject }; } + #endif // __ACCL_PUSH_ENGINE_HH__ diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 9b16a15575..bfabed33e9 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -76,12 +76,6 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -WLEngine::WLRespPort::trySendRetry() -{ - sendRetryReq(); -} - void 
WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) { @@ -162,7 +156,8 @@ WLEngine::recvFunctional(PacketPtr pkt) memPort.recvFunctional(pkt); } -bool WLEngine::handleWLUpdate(PacketPtr pkt){ +bool +WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ queue.sendPktRetry = true; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 8d02c16981..ad53fd7e7e 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -88,8 +88,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - virtual AddrRangeList getAddrRanges() const override; - void trySendRetry(); + virtual AddrRangeList getAddrRanges(); protected: virtual bool recvTimingReq(PacketPtr pkt); From f62d592c1a5a1f7d397e025a6d9f8a8037a17e12 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:36:23 -0800 Subject: [PATCH 025/247] Bug fix. --- src/accl/push_engine.cc | 24 ++++++++++++++++++------ src/accl/push_engine.hh | 13 +++++-------- src/accl/wl_engine.cc | 2 +- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 746ed8a142..bf385818f5 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -122,6 +122,24 @@ PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) owner->recvFunctional(pkt); } +void +PushEngine::PushRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +AddrRangeList +PushEngine::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + +void +PushEngine::recvFunctional(PacketPtr pkt) +{ + memPort.sendFunctional(pkt); +} + bool PushEngine::handleUpdate(PacketPtr pkt) { @@ -293,12 +311,6 @@ PushEngine::PushReqPort::recvReqRetry() } } -AddrRangeList -PushEngine::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - void PushEngine::PushMemPort::recvReqRetry() { diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 
1aa70c7acb..269170c045 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -131,23 +131,20 @@ class PushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + + bool handleUpdate(PacketPtr pkt); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); EventFunctionWrapper nextReadEvent; void processNextReadEvent(); + bool handleMemResp(PacketPtr pkt); EventFunctionWrapper nextSendEvent; void processNextSendEvent(); - bool handleUpdate(PacketPtr pkt); - - bool handleMemResp(PacketPtr pkt); - - AddrRangeList getAddrRanges(); - - void recvFunctional(PacketPtr pkt); - public: PushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index bfabed33e9..8365e754fc 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -153,7 +153,7 @@ WLEngine::getAddrRanges() const void WLEngine::recvFunctional(PacketPtr pkt) { - memPort.recvFunctional(pkt); + memPort.sendFunctional(pkt); } bool From e4cbf3493f1179d195209bc0aa007c7cda112506 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:50:05 -0800 Subject: [PATCH 026/247] Bug fixes. 
--- src/accl/wl_engine.cc | 16 +++++++++++----- src/accl/wl_engine.hh | 6 +++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 8365e754fc..872f38673e 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -41,11 +41,11 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - nextWLReadEvent([this]{processNextWLReadEvent; }, name()), - nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) + nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), + nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()), + updateQueue(queueSize), + responseQueue(queueSize) { - updateQueue.resize(queueSize); - responseQueue.resize(queueSize); } Port & @@ -88,6 +88,12 @@ WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) panic("recvAtomic unimpl."); } +void +WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + void WLEngine::WLRespPort::recvRespRetry() { @@ -256,7 +262,7 @@ WLEngine::processNextWLReduceEvent(){ } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index ad53fd7e7e..fe26d22aef 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -172,14 +172,14 @@ class WLEngine : public ClockedObject WLQueue updateQueue; WLQueue responseQueue; - - - public: AddrRangeList getAddrRanges() const; bool handleWLUpdate(PacketPtr pkt); bool handleMemResp(PacketPtr resp); void recvFunctional(PacketPtr pkt); + + public: WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; From b1e3386565a90f3c4170c72688da1e7f01a3ef7f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:56:40 -0800 Subject: [PATCH 027/247] 
Bug fix. --- src/accl/push_engine.hh | 5 +---- src/accl/wl_engine.cc | 2 +- src/accl/wl_engine.hh | 28 +++++++++++++--------------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 269170c045..ea9026ff8f 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -49,13 +49,10 @@ class PushEngine : public ClockedObject { private: PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; public: PushRespPort(const std::string& name, PushEngine* owner): - ResponsePort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) + ResponsePort(name, owner), owner(owner) {} virtual AddrRangeList getAddrRanges(); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 872f38673e..98c940a2de 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -262,7 +262,7 @@ WLEngine::processNextWLReduceEvent(){ } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); + respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index fe26d22aef..94ac7c7aff 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -82,12 +82,11 @@ class WLEngine : public ClockedObject { private: WLEngine *owner; - PacketPtr blockedPacket; public: - WLRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + WLRespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} virtual AddrRangeList getAddrRanges(); protected: @@ -105,12 +104,12 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + WLReqPort(const std::string& name, WLEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacketPtr pkt); - bool blocked(){ - return _blocked; - } + bool blocked() { 
return _blocked; } protected: void recvReqRetry() override; @@ -125,13 +124,12 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLMemPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + WLMemPort(const std::string& name, WLEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacketPtr pkt); - void trySendRetry(); - bool blocked(){ - return _blocked; - } + bool blocked() { return _blocked; } protected: virtual bool recvTimingResp(PacketPtr pkt); From 4541367e7f3091feb30a81c403cbdd9d1d1e9b0b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 13:44:39 -0800 Subject: [PATCH 028/247] Bug fix. --- src/accl/apply.cc | 12 ------------ src/accl/apply.hh | 34 +++++++++++++++------------------- src/accl/push_engine.cc | 2 +- src/accl/push_engine.hh | 2 +- src/accl/wl_engine.hh | 2 +- 5 files changed, 18 insertions(+), 34 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 410eff5268..b493d3d152 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -75,12 +75,6 @@ bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -Apply::ApplyRespPort::trySendRetry() -{ - sendRetryReq(); -} - void Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) { @@ -116,12 +110,6 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } } -void -Apply::ApplyMemPort::trySendRetry() -{ - sendRetryResp(); -} - void Apply::ApplyMemPort::recvReqRetry() { diff --git a/src/accl/apply.hh b/src/accl/apply.hh index f08c1fef85..6ab639c552 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -64,7 +64,7 @@ class Apply : public ClockedObject } PacketPtr pop(){ - return applyQueue->pop(); + return applyQueue.pop(); } PacketPtr front(){ @@ -79,16 +79,12 @@ class Apply : public ClockedObject { private: Apply *owner; - bool _blocked; - PacketPtr blockedPacket; public: - void trySendRetry(); - virtual AddrRangeList getAddrRanges(); 
ApplyRespPort(const std::string& name, Apply* owner): - ResponsePort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) + ResponsePort(name, owner), owner(owner) {} + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); @@ -140,16 +136,24 @@ class Apply : public ClockedObject void recvReqRetry() override; }; + System* const system; + const RequestorID requestorId; + ApplyMemPort memPort; ApplyRespPort respPort; ApplyReqPort reqPort; + ApplyQueue applyReadQueue; + ApplyQueue applyWriteQueue; + + std::unordered_map requestOffset; + bool handleWL(PacketPtr pkt); - bool sendPacket(); - //one queue for write and one for read a priotizes write over read - void readApplyBuffer(); + // bool sendPacket(); + // //one queue for write and one for read a priotizes write over read + // void readApplyBuffer(); bool handleMemResp(PacketPtr resp); - void writePushBuffer(); + // void writePushBuffer(); //Events void processNextApplyCheckEvent(); @@ -166,16 +170,8 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ - System* const system; - const RequestorID requestorId; - AddrRangeList getAddrRanges() const; - ApplyQueue applyReadQueue; - ApplyQueue applyWriteQueue; - - std::unordered_map requestOffset; - public: Apply(const ApplyParams &apply); diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index bf385818f5..fde79a5aa7 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -99,7 +99,7 @@ PushEngine::startup() } AddrRangeList -PushEngine::PushRespPort::getAddrRanges() +PushEngine::PushRespPort::getAddrRanges() const { return owner->getAddrRanges(); } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index ea9026ff8f..fbb7d6915a 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -54,7 +54,7 @@ class PushEngine : public ClockedObject PushRespPort(const std::string& name, PushEngine* owner): ResponsePort(name, owner), owner(owner) {} - 
virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 94ac7c7aff..504b63bc46 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -87,7 +87,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, WLEngine* owner): ResponsePort(name, owner), owner(owner) {} - virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); From eb31d031f86ed681b6e974aeda16456daf0e67ef Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 15 Feb 2022 17:02:08 -0800 Subject: [PATCH 029/247] Apply engine compiles --- src/accl/apply.cc | 33 +++++++++++++++++++++++++++------ src/accl/apply.hh | 45 ++++++++++++++++++++++----------------------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b493d3d152..55288693f3 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -41,10 +41,12 @@ Apply::Apply(const ApplyParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextApplyEvent([this]{ processNextApplyEvent(); }, name()), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + applyReadQueue(params.applyQueueSize), + applyWriteQueue(params.applyQueueSize) { - applyReadQueue(params.applyQueueSize); - applyWriteQueue(params.applyQueueSize); + // applyReadQueue(params.applyQueueSize); + // applyWriteQueue(params.applyQueueSize); } Port & @@ -62,7 +64,7 @@ Apply::getPort(const std::string &if_name, PortID idx) } AddrRangeList -Apply::ApplyRespPort::getAddrRanges() +Apply::ApplyRespPort::getAddrRanges() const { return owner->getAddrRanges(); } @@ -93,6 +95,12 @@ Apply::ApplyRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } 
+void +Apply::ApplyRespPort::trySendRetry() +{ + sendRetryReq(); +} + bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { @@ -118,6 +126,12 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } +void +Apply::ApplyMemPort::trySendRetry() +{ + sendRetryResp(); +} + void Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { @@ -135,6 +149,12 @@ Apply::ApplyReqPort::recvReqRetry() blockedPacket = nullptr; } +bool +Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvRespRetry from response port is called."); +} + AddrRangeList Apply::getAddrRanges() const { @@ -158,7 +178,8 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ - auto pkt = queue.pop(); + PacketPtr pkt = queue.front(); + queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ respPort.trySendRetry(); queue.sendPktRetry = false; @@ -229,7 +250,7 @@ Apply::processNextApplyEvent(){ } } }else{ - queue.pop(); + queue.applyQueue.pop(); if (queue.sendPktRetry && !queue.blocked()){ memPort.trySendRetry(); queue.sendPktRetry = false; diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 6ab639c552..7f17e173c6 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -63,8 +63,8 @@ class Apply : public ClockedObject applyQueue.push(pkt); } - PacketPtr pop(){ - return applyQueue.pop(); + void pop(){ + applyQueue.pop(); } PacketPtr front(){ @@ -72,20 +72,20 @@ class Apply : public ClockedObject } ApplyQueue(uint32_t qSize): - queueSize(qSize){} + queueSize(qSize), + sendPktRetry(false){} }; class ApplyRespPort : public ResponsePort { private: Apply *owner; - public: ApplyRespPort(const std::string& name, Apply* owner): ResponsePort(name, owner), owner(owner) {} virtual AddrRangeList getAddrRanges() const; - + void trySendRetry(); protected: virtual bool recvTimingReq(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt); @@ -105,7 +105,6 @@ class Apply : public ClockedObject RequestPort(name, owner), 
owner(owner), _blocked(false), blockedPacket(nullptr) {} - void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } @@ -139,9 +138,24 @@ class Apply : public ClockedObject System* const system; const RequestorID requestorId; - ApplyMemPort memPort; - ApplyRespPort respPort; ApplyReqPort reqPort; + ApplyRespPort respPort; + ApplyMemPort memPort; + + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + EventFunctionWrapper nextApplyCheckEvent; + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -155,21 +169,6 @@ class Apply : public ClockedObject bool handleMemResp(PacketPtr resp); // void writePushBuffer(); - //Events - void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyCheckEvent; - /* Syncronously checked - If there are any active vertecies: - create memory read packets + MPU::MPU::MemPortsendTimingReq - */ - void processNextApplyEvent(); - EventFunctionWrapper nextApplyEvent; - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - AddrRangeList getAddrRanges() const; public: From e3a7f1c1d727c2497e10003d781f404771345a5b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:22:29 -0800 Subject: [PATCH 030/247] Bug fix. Very close to first compilation. 
--- src/accl/apply.cc | 60 +++++++------------ src/accl/apply.hh | 87 ++++++++++++++------------- src/accl/push_engine.cc | 126 ++++++++++++++++++++++------------------ src/accl/util.hh | 14 +++++ src/accl/wl_engine.cc | 22 ++----- src/accl/wl_engine.hh | 3 +- 6 files changed, 153 insertions(+), 159 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 55288693f3..9c3d3f1c3d 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -30,6 +30,8 @@ #include +#include "accl/util.hh" + namespace gem5 { @@ -37,17 +39,14 @@ Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), system(params.system), requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), applyReadQueue(params.applyQueueSize), - applyWriteQueue(params.applyQueueSize) -{ - // applyReadQueue(params.applyQueueSize); - // applyWriteQueue(params.applyQueueSize); -} + applyWriteQueue(params.applyQueueSize), + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + nextApplyEvent([this]{ processNextApplyEvent(); }, name()) +{} Port & Apply::getPort(const std::string &if_name, PortID idx) @@ -96,22 +95,8 @@ Apply::ApplyRespPort::recvRespRetry() } void -Apply::ApplyRespPort::trySendRetry() -{ - sendRetryReq(); -} - -bool -Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; @@ -119,30 +104,27 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyMemPort::recvReqRetry() +Apply::ApplyReqPort::recvReqRetry() { _blocked = 
false; sendPacket(blockedPacket); blockedPacket = nullptr; } -void -Apply::ApplyMemPort::trySendRetry() +bool +Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) { - sendRetryResp(); + panic("recvTimingResp called on reqPort."); } -void -Apply::ApplyReqPort::sendPacket(PacketPtr pkt) +bool +Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + return owner->handleMemResp(pkt); } void -Apply::ApplyReqPort::recvReqRetry() +Apply::ApplyMemPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -179,9 +161,8 @@ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ PacketPtr pkt = queue.front(); - queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - respPort.trySendRetry(); + // respPort.trySendRetry(); queue.sendPktRetry = false; } // conver to ReadReq @@ -190,7 +171,8 @@ void Apply::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - memPort.sendPacket(memPkt); + memPort.sendPacke:(memPkt); + queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -245,14 +227,14 @@ Apply::processNextApplyEvent(){ reqPort.sendPacket(writePkt); queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } } }else{ queue.applyQueue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 7f17e173c6..2a16632e22 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -32,7 +32,6 @@ #include #include -#include "accl/util.hh" #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -49,31 +48,31 @@ class Apply : public 
ClockedObject private: struct ApplyQueue{ - std::queue applyQueue; - const uint32_t queueSize; - bool sendPktRetry; - - bool blocked(){ - return (applyQueue.size() == queueSize); - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - void pop(){ - applyQueue.pop(); - } - - PacketPtr front(){ - return applyQueue.front(); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize), - sendPktRetry(false){} + std::queue applyQueue; + const uint32_t queueSize; + bool sendPktRetry; + + bool blocked(){ + return (applyQueue.size() == queueSize); + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + void pop(){ + applyQueue.pop(); + } + + PacketPtr front(){ + return applyQueue.front(); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize) + {} }; class ApplyRespPort : public ResponsePort @@ -109,8 +108,8 @@ class Apply : public ClockedObject bool blocked() { return _blocked; } protected: - void recvReqRetry() override; virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; }; class ApplyMemPort : public RequestPort @@ -127,7 +126,7 @@ class Apply : public ClockedObject {} void sendPacket(PacketPtr pkt); - void trySendRetry(); + // void trySendRetry(); bool blocked(){ return _blocked;} protected: @@ -138,25 +137,10 @@ class Apply : public ClockedObject System* const system; const RequestorID requestorId; - ApplyReqPort reqPort; ApplyRespPort respPort; + ApplyReqPort reqPort; ApplyMemPort memPort; - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - - EventFunctionWrapper nextApplyCheckEvent; - void processNextApplyCheckEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + 
MPU::MPU::MemPortsendTimingReq - */ - ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -169,6 +153,21 @@ class Apply : public ClockedObject bool handleMemResp(PacketPtr resp); // void writePushBuffer(); + //Events + EventFunctionWrapper nextApplyCheckEvent; + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + AddrRangeList getAddrRanges() const; public: diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index fde79a5aa7..125433653b 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -26,9 +26,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/util.hh" #include "accl/push_engine.hh" +#include "accl/util.hh" + namespace gem5 { @@ -128,6 +129,68 @@ PushEngine::PushRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } +void +PushEngine::PushReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::PushReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::PushReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +void +PushEngine::PushMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + AddrRangeList PushEngine::getAddrRanges() { @@ -224,24 +287,8 @@ void PushEngine::processNextReadEvent() } } -bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -PushEngine::PushMemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool PushEngine::handleMemResp(PacketPtr pkt) +bool +PushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -259,7 +306,8 @@ bool PushEngine::handleMemResp(PacketPtr pkt) // TODO: Implement propagate function here *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, + requestorId); updateQueue.push(update); } @@ -286,42 +334,4 @@ void PushEngine::processNextSendEvent() } } -void -PushEngine::PushReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -void -PushEngine::PushReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -void -PushEngine::PushMemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - } diff --git a/src/accl/util.hh b/src/accl/util.hh index 91692488a4..b3cff93f15 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "base/cprintf.hh" #include "base/types.hh" #include "mem/packet.hh" #include "mem/request.hh" @@ -39,12 +40,25 @@ struct WorkListItem uint32_t prop; uint32_t degree; uint32_t edgeIndex; + + std::string to_string() + { + return csprintf( + "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", + temp_prop, prop, degree, edgeIndex); + } + }; struct Edge { uint64_t weight; Addr neighbor; + + std::string to_string() + { + return csprintf("Edge{weight: %lu, neighbor: %lu}", weight, neighbor); + } }; WorkListItem memoryToWorkList(uint8_t* data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 98c940a2de..eb883cb19b 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -30,6 +30,8 @@ #include +#include "accl/util.hh" + namespace gem5 { @@ -76,12 +78,6 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - Tick WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) { @@ -125,12 +121,6 @@ WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -void -WLEngine::WLMemPort::trySendRetry() -{ - sendRetryResp(); -} - void WLEngine::WLReqPort::recvReqRetry() { @@ -244,12 +234,12 @@ WLEngine::processNextWLReduceEvent(){ applyPort.sendPacket(writePkt); queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = false; } } @@ -257,12 +247,12 @@ WLEngine::processNextWLReduceEvent(){ else{ queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = 
false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 504b63bc46..ee25154caa 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -32,7 +32,6 @@ #include #include -#include "accl/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" @@ -140,8 +139,8 @@ class WLEngine : public ClockedObject const uint32_t queueSize; const RequestorID requestorId; - WLReqPort reqPort; WLRespPort respPort; + WLReqPort reqPort; WLMemPort memPort; bool handleWLU(PacketPtr pkt); From 099f68905a083c566dcb1334b9c1b4fae3c1edcf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:46:20 -0800 Subject: [PATCH 031/247] More bug fixes. --- src/accl/apply.cc | 8 +------- src/accl/util.hh | 3 ++- src/accl/wl_engine.cc | 12 +++++------- src/accl/wl_engine.hh | 18 ++++++------------ 4 files changed, 14 insertions(+), 27 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 9c3d3f1c3d..b18c990da2 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -131,12 +131,6 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } -bool -Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvRespRetry from response port is called."); -} - AddrRangeList Apply::getAddrRanges() const { @@ -171,7 +165,7 @@ void Apply::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - memPort.sendPacke:(memPkt); + memPort.sendPacket(memPkt); queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ diff --git a/src/accl/util.hh b/src/accl/util.hh index b3cff93f15..a4418a1cb8 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -71,6 +71,7 @@ PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); -PacketPtr getUpdatePacket(Addr addr, unsigned 
int size, uint8_t *data); +PacketPtr getUpdatePacket(Addr addr, unsigned int size, + uint8_t *data, RequestorID requestorId); } diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index eb883cb19b..614f34d175 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -38,17 +38,15 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), system(params.system), - queueSize(params.wlQueueSize), requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), + updateQueue(params.wlQueueSize), + responseQueue(params.wlQueueSize), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), - nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()), - updateQueue(queueSize), - responseQueue(queueSize) -{ -} + nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) +{} Port & WLEngine::getPort(const std::string &if_name, PortID idx) diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index ee25154caa..57cc063880 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -136,26 +136,26 @@ class WLEngine : public ClockedObject }; System* const system; - const uint32_t queueSize; const RequestorID requestorId; WLRespPort respPort; WLReqPort reqPort; WLMemPort memPort; - bool handleWLU(PacketPtr pkt); - bool sendPacket(); - //one queue for write and one for read a priotizes write over read - void readWLBuffer(); + WLQueue updateQueue; + WLQueue responseQueue; + std::unordered_map requestOffset; //Events + bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ + bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by 
MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -164,14 +164,8 @@ class WLEngine : public ClockedObject Write edgelist loc in buffer */ - std::unordered_map requestOffset; - - WLQueue updateQueue; - WLQueue responseQueue; - AddrRangeList getAddrRanges() const; - bool handleWLUpdate(PacketPtr pkt); - bool handleMemResp(PacketPtr resp); + void recvFunctional(PacketPtr pkt); public: From 793d75564e15d66b6d8e81f2a75dfd324465eb41 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:53:21 -0800 Subject: [PATCH 032/247] Compilation. yeay. --- src/accl/apply.cc | 9 +++++++++ src/accl/wl_engine.cc | 40 +++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b18c990da2..40002c5264 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -117,6 +117,15 @@ Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) panic("recvTimingResp called on reqPort."); } +void +Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 614f34d175..d2ecd0d7c9 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -94,17 +94,14 @@ WLEngine::WLRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } -void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +bool +WLEngine::WLReqPort::recvTimingResp(PacketPtr) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + panic("recvTimingResp called on the request port."); } void -WLEngine::WLMemPort::recvReqRetry() +WLEngine::WLReqPort::recvReqRetry() { // We should have a blocked packet if this function is called. 
assert(_blocked && blockedPacket != nullptr); @@ -113,14 +110,26 @@ WLEngine::WLMemPort::recvReqRetry() blockedPacket = nullptr; } -bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +void +WLEngine::WLReqPort::sendPacket(PacketPtr pkt) { - return owner->handleMemResp(pkt); + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } } void -WLEngine::WLReqPort::recvReqRetry() +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() { // We should have a blocked packet if this function is called. assert(_blocked && blockedPacket != nullptr); @@ -129,13 +138,10 @@ WLEngine::WLReqPort::recvReqRetry() blockedPacket = nullptr; } -void -WLEngine::WLReqPort::sendPacket(PacketPtr pkt) +bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + return owner->handleMemResp(pkt); } AddrRangeList From 5e05fe3d6caa51cada748e2dc6e2e200c84932c7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 16 Feb 2022 10:31:28 -0800 Subject: [PATCH 033/247] Fixing a typo. --- src/accl/PushEngine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py index 3215fdaee2..840d8dea1f 100644 --- a/src/accl/PushEngine.py +++ b/src/accl/PushEngine.py @@ -30,7 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class PushEngine(ClockedObject): - type = 'WLEngine' + type = 'PushEngine' cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' From f35e40e74c7b42f5cd3ffc68b89ef2a714f5dab9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 18 Feb 2022 14:08:41 -0800 Subject: [PATCH 034/247] Restructuring the directory. 
--- src/accl/{ => graph/base}/Apply.py | 0 src/accl/{ => graph/base}/PushEngine.py | 0 src/accl/{ => graph/base}/SConscript | 0 src/accl/{ => graph/base}/WLEngine.py | 0 src/accl/{ => graph/base}/apply.cc | 73 +-------------------- src/accl/{ => graph/base}/apply.hh | 44 +------------ src/accl/{ => graph/base}/push_engine.cc | 0 src/accl/{ => graph/base}/push_engine.hh | 0 src/accl/{ => graph/base}/util.cc | 0 src/accl/{ => graph/base}/util.hh | 0 src/accl/{ => graph/base}/wl_engine.cc | 83 +----------------------- src/accl/{ => graph/base}/wl_engine.hh | 49 +------------- src/accl/graph/sega/mpu.hh | 0 13 files changed, 7 insertions(+), 242 deletions(-) rename src/accl/{ => graph/base}/Apply.py (100%) rename src/accl/{ => graph/base}/PushEngine.py (100%) rename src/accl/{ => graph/base}/SConscript (100%) rename src/accl/{ => graph/base}/WLEngine.py (100%) rename src/accl/{ => graph/base}/apply.cc (80%) rename src/accl/{ => graph/base}/apply.hh (78%) rename src/accl/{ => graph/base}/push_engine.cc (100%) rename src/accl/{ => graph/base}/push_engine.hh (100%) rename src/accl/{ => graph/base}/util.cc (100%) rename src/accl/{ => graph/base}/util.hh (100%) rename src/accl/{ => graph/base}/wl_engine.cc (79%) rename src/accl/{ => graph/base}/wl_engine.hh (75%) create mode 100644 src/accl/graph/sega/mpu.hh diff --git a/src/accl/Apply.py b/src/accl/graph/base/Apply.py similarity index 100% rename from src/accl/Apply.py rename to src/accl/graph/base/Apply.py diff --git a/src/accl/PushEngine.py b/src/accl/graph/base/PushEngine.py similarity index 100% rename from src/accl/PushEngine.py rename to src/accl/graph/base/PushEngine.py diff --git a/src/accl/SConscript b/src/accl/graph/base/SConscript similarity index 100% rename from src/accl/SConscript rename to src/accl/graph/base/SConscript diff --git a/src/accl/WLEngine.py b/src/accl/graph/base/WLEngine.py similarity index 100% rename from src/accl/WLEngine.py rename to src/accl/graph/base/WLEngine.py diff --git 
a/src/accl/apply.cc b/src/accl/graph/base/apply.cc similarity index 80% rename from src/accl/apply.cc rename to src/accl/graph/base/apply.cc index 40002c5264..eae9c2fd16 100644 --- a/src/accl/apply.cc +++ b/src/accl/graph/base/apply.cc @@ -30,17 +30,13 @@ #include -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), @@ -51,72 +47,13 @@ Apply::Apply(const ApplyParams ¶ms): Port & Apply::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } -AddrRangeList -Apply::ApplyRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleWL(pkt)){ - return false; - } - return true; -} - -void -Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) -{ - panic("Not implemented"); -} - -Tick -Apply::ApplyRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -Apply::ApplyRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -Apply::ApplyReqPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -Apply::ApplyReqPort::recvReqRetry() -{ - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -bool -Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on reqPort."); -} - void Apply::ApplyMemPort::sendPacket(PacketPtr pkt) { @@ -140,12 
+77,6 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } -AddrRangeList -Apply::getAddrRanges() const -{ - return memPort.getAddrRanges(); -} - bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ diff --git a/src/accl/apply.hh b/src/accl/graph/base/apply.hh similarity index 78% rename from src/accl/apply.hh rename to src/accl/graph/base/apply.hh index 2a16632e22..a3f0ff5aa3 100644 --- a/src/accl/apply.hh +++ b/src/accl/graph/base/apply.hh @@ -46,7 +46,7 @@ namespace gem5 class Apply : public ClockedObject { private: - + //FIXME: Remove queue defenition from here. struct ApplyQueue{ std::queue applyQueue; const uint32_t queueSize; @@ -75,43 +75,6 @@ class Apply : public ClockedObject {} }; - class ApplyRespPort : public ResponsePort - { - private: - Apply *owner; - public: - ApplyRespPort(const std::string& name, Apply* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - void trySendRetry(); - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class ApplyReqPort : public RequestPort - { - private: - Apply *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ApplyReqPort(const std::string& name, Apply* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - class ApplyMemPort : public RequestPort { private: @@ -134,11 +97,8 @@ class Apply : public ClockedObject void recvReqRetry() override; }; - System* const system; const RequestorID requestorId; - ApplyRespPort respPort; - ApplyReqPort reqPort; ApplyMemPort memPort; ApplyQueue applyReadQueue; @@ -168,8 +128,6 @@ class Apply : public ClockedObject Write edgelist 
loc in buffer */ - AddrRangeList getAddrRanges() const; - public: Apply(const ApplyParams &apply); diff --git a/src/accl/push_engine.cc b/src/accl/graph/base/push_engine.cc similarity index 100% rename from src/accl/push_engine.cc rename to src/accl/graph/base/push_engine.cc diff --git a/src/accl/push_engine.hh b/src/accl/graph/base/push_engine.hh similarity index 100% rename from src/accl/push_engine.hh rename to src/accl/graph/base/push_engine.hh diff --git a/src/accl/util.cc b/src/accl/graph/base/util.cc similarity index 100% rename from src/accl/util.cc rename to src/accl/graph/base/util.cc diff --git a/src/accl/util.hh b/src/accl/graph/base/util.hh similarity index 100% rename from src/accl/util.hh rename to src/accl/graph/base/util.hh diff --git a/src/accl/wl_engine.cc b/src/accl/graph/base/wl_engine.cc similarity index 79% rename from src/accl/wl_engine.cc rename to src/accl/graph/base/wl_engine.cc index d2ecd0d7c9..dc8f1dd744 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/graph/base/wl_engine.cc @@ -26,21 +26,17 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/wl_engine.hh" +#include "accl/graph/base/wl_engine.hh" #include -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), @@ -51,74 +47,13 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): Port & WLEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } -AddrRangeList -WLEngine::WLRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleWLUpdate(pkt)){ - return false; - } - return true; -} - -Tick -WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::WLRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -bool -WLEngine::WLReqPort::recvTimingResp(PacketPtr) -{ - panic("recvTimingResp called on the request port."); -} - -void -WLEngine::WLReqPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. 
- assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -void -WLEngine::WLReqPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - void WLEngine::WLMemPort::sendPacket(PacketPtr pkt) { @@ -144,18 +79,6 @@ WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -AddrRangeList -WLEngine::getAddrRanges() const -{ - return memPort.getAddrRanges(); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - memPort.sendFunctional(pkt); -} - bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; diff --git a/src/accl/wl_engine.hh b/src/accl/graph/base/wl_engine.hh similarity index 75% rename from src/accl/wl_engine.hh rename to src/accl/graph/base/wl_engine.hh index 57cc063880..3654999b70 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/graph/base/wl_engine.hh @@ -46,7 +46,7 @@ namespace gem5 class WLEngine : public ClockedObject { private: - + //FIXME: Change this struct WLQueue{ std::queue wlQueue; uint32_t queueSize; @@ -77,44 +77,6 @@ class WLEngine : public ClockedObject sendPktRetry(false){} }; - class WLRespPort : public ResponsePort //From Push engine - { - private: - WLEngine *owner; - - public: - WLRespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class WLReqPort : public RequestPort //To Apply Engine - { - private: - WLEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - WLReqPort(const std::string& name, WLEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - 
- protected: - void recvReqRetry() override; - virtual bool recvTimingResp(PacketPtr pkt); - }; - class WLMemPort : public RequestPort { private: @@ -135,11 +97,6 @@ class WLEngine : public ClockedObject void recvReqRetry() override; }; - System* const system; - const RequestorID requestorId; - - WLRespPort respPort; - WLReqPort reqPort; WLMemPort memPort; WLQueue updateQueue; @@ -164,10 +121,6 @@ class WLEngine : public ClockedObject Write edgelist loc in buffer */ - AddrRangeList getAddrRanges() const; - - void recvFunctional(PacketPtr pkt); - public: WLEngine(const WLEngineParams ¶ms); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..e69de29bb2 From d02f3824f8f6fc41ae6cff87bfccff497405d78a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 09:59:09 -0800 Subject: [PATCH 035/247] Restructing the classes. --- src/accl/graph/base/Apply.py | 5 +--- src/accl/graph/base/PushEngine.py | 3 -- src/accl/graph/base/WLEngine.py | 5 +--- src/accl/graph/sega/MPU.py | 46 +++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 11 deletions(-) create mode 100644 src/accl/graph/sega/MPU.py diff --git a/src/accl/graph/base/Apply.py b/src/accl/graph/base/Apply.py index 8720287cc8..80aa430139 100644 --- a/src/accl/graph/base/Apply.py +++ b/src/accl/graph/base/Apply.py @@ -34,8 +34,5 @@ class Apply(ClockedObject): cxx_header = "accl/apply.hh" cxx_class = 'gem5::Apply' - system = Param.System(Parent.any, "The system object this apply engine is a part of") - respPort = ResponsePort("Receives requests from WorkList") - reqPort = RequestPort("Sends requests to Push") - memPort = RequestPort("Memory side port, sends requests") applyQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/PushEngine.py b/src/accl/graph/base/PushEngine.py index 840d8dea1f..7fef165169 100644 --- a/src/accl/graph/base/PushEngine.py 
+++ b/src/accl/graph/base/PushEngine.py @@ -34,7 +34,4 @@ class PushEngine(ClockedObject): cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' - system = Param.System(Parent.any, "The system object this push engine is a part of") - respPort = ResponsePort("Port to Receive updates from outside") - reqPort = RequestPort("Port to send updates to the outside") memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/WLEngine.py b/src/accl/graph/base/WLEngine.py index 562fd04423..deaee20935 100644 --- a/src/accl/graph/base/WLEngine.py +++ b/src/accl/graph/base/WLEngine.py @@ -34,8 +34,5 @@ class WLEngine(ClockedObject): cxx_header = "accl/wl_engine.hh" cxx_class = 'gem5::WLEngine' - system = Param.System(Parent.any, "The system object this push WorkList is a part of") - respPort = ResponsePort("Receives updates") - reqPort = RequestPort("Sends requests to Apply") - memPort = RequestPort("Memory side port, sends requests") wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..b6e136dda5 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.WLEngine import WLEngine +from m5.objects.PushEngine import PushEngine +from m5.objects.ApplyEngine import ApplyEngine + +class MPU(ClockedObject): + type = 'MPU' + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = 'gem5::MPU' + + workListEngine = Param.WLEngine("WLEngine object to connect to " + "This MPU") + applyEngine = Param.ApplyEngine("ApplyEngine object to connect to " + "This MPU") + pushEngine = Param.PushEngine("PushEngine object to connect to " + "This MPU") From bfb12794aa99858bb88afab45640cc27c90bde76 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 11:12:50 -0800 Subject: [PATCH 036/247] Sperating WLEngine and BaseWLEngine + few changes in BaseApplyEngine --- .../base/{Apply.py => BaseApplyEngine.py} | 0 src/accl/graph/base/BaseWLEngine.py | 38 ++++++++++++++++++ .../base/{apply.cc => base_apply_engine.cc} | 20 +++++----- .../base/{apply.hh => base_apply_engine.hh} | 35 +++++----------- .../base/{wl_engine.cc => base_wl_engine.cc} | 20 +++++----- .../base/{wl_engine.hh => base_wl_engine.hh} | 13 +++--- src/accl/graph/sega/ApplyEngine.py | 40 +++++++++++++++++++ src/accl/graph/{base => sega}/WLEngine.py | 12 +++--- src/accl/graph/sega/apply_engine.cc | 0 src/accl/graph/sega/apply_engine.hh | 0 src/accl/graph/sega/wl_engine.cc | 0 src/accl/graph/sega/wl_engine.hh | 0 12 files changed, 120 insertions(+), 58 deletions(-) rename src/accl/graph/base/{Apply.py => BaseApplyEngine.py} (100%) create mode 100644 src/accl/graph/base/BaseWLEngine.py rename src/accl/graph/base/{apply.cc => base_apply_engine.cc} (91%) rename src/accl/graph/base/{apply.hh => base_apply_engine.hh} (79%) rename src/accl/graph/base/{wl_engine.cc => base_wl_engine.cc} (91%) rename src/accl/graph/base/{wl_engine.hh => base_wl_engine.hh} (93%) create mode 100644 src/accl/graph/sega/ApplyEngine.py rename 
src/accl/graph/{base => sega}/WLEngine.py (84%) create mode 100644 src/accl/graph/sega/apply_engine.cc create mode 100644 src/accl/graph/sega/apply_engine.hh create mode 100644 src/accl/graph/sega/wl_engine.cc create mode 100644 src/accl/graph/sega/wl_engine.hh diff --git a/src/accl/graph/base/Apply.py b/src/accl/graph/base/BaseApplyEngine.py similarity index 100% rename from src/accl/graph/base/Apply.py rename to src/accl/graph/base/BaseApplyEngine.py diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py new file mode 100644 index 0000000000..7384e876ef --- /dev/null +++ b/src/accl/graph/base/BaseWLEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseWLEngine(ClockedObject): + type = 'BaseWLEngine' + cxx_header = "accl/base_wl_engine.hh" + cxx_class = 'gem5::BaseWLEngine' + + wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/apply.cc b/src/accl/graph/base/base_apply_engine.cc similarity index 91% rename from src/accl/graph/base/apply.cc rename to src/accl/graph/base/base_apply_engine.cc index eae9c2fd16..c88d14a2c2 100644 --- a/src/accl/graph/base/apply.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/apply.hh" +#include "accl/base_apply_engine.hh" #include @@ -35,7 +35,7 @@ namespace gem5 { -Apply::Apply(const ApplyParams ¶ms): +BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), @@ -45,7 +45,7 @@ Apply::Apply(const ApplyParams ¶ms): {} Port & -Apply::getPort(const std::string &if_name, PortID idx) +BaseApplyEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "memPort") { return memPort; @@ -55,7 +55,7 @@ Apply::getPort(const std::string &if_name, PortID idx) } void -Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -64,20 +64,20 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } bool -Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) +BaseApplyEngine::ApplyMemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } void -Apply::ApplyMemPort::recvReqRetry() +BaseApplyEngine::ApplyMemPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); blockedPacket = nullptr; } -bool Apply::handleWL(PacketPtr pkt){ +bool BaseApplyEngine::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ queue.sendPktRetry = true; @@ -91,7 +91,7 @@ bool Apply::handleWL(PacketPtr pkt){ return true; } -void Apply::processNextApplyCheckEvent(){ +void BaseApplyEngine::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ PacketPtr pkt = queue.front(); @@ -114,7 +114,7 @@ void Apply::processNextApplyCheckEvent(){ } bool -Apply::handleMemResp(PacketPtr pkt) +BaseApplyEngine::handleMemResp(PacketPtr pkt) { auto queue = applyWriteQueue; @@ -132,7 +132,7 @@ Apply::handleMemResp(PacketPtr pkt) } void -Apply::processNextApplyEvent(){ +BaseApplyEngine::processNextApplyEvent(){ auto queue = applyWriteQueue; PacketPtr pkt = queue.front(); uint8_t* data = 
pkt->getPtr(); diff --git a/src/accl/graph/base/apply.hh b/src/accl/graph/base/base_apply_engine.hh similarity index 79% rename from src/accl/graph/base/apply.hh rename to src/accl/graph/base/base_apply_engine.hh index a3f0ff5aa3..c2d2f26387 100644 --- a/src/accl/graph/base/apply.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -32,18 +32,16 @@ #include #include -#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/Apply.hh" +#include "params/BaseApplyEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" -#include "sim/system.hh" namespace gem5 { -class Apply : public ClockedObject +class BaseApplyEngine : public ClockedObject { private: //FIXME: Remove queue defenition from here. @@ -75,21 +73,20 @@ class Apply : public ClockedObject {} }; - class ApplyMemPort : public RequestPort + class MemPort : public RequestPort { private: - Apply *owner; + BaseApplyEngine *owner; bool _blocked; PacketPtr blockedPacket; public: - ApplyMemPort(const std::string& name, Apply* owner): + MemPort(const std::string& name, BaseApplyEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} void sendPacket(PacketPtr pkt); - // void trySendRetry(); bool blocked(){ return _blocked;} protected: @@ -99,7 +96,7 @@ class Apply : public ClockedObject const RequestorID requestorId; - ApplyMemPort memPort; + MemPort memPort; ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -107,29 +104,15 @@ class Apply : public ClockedObject std::unordered_map requestOffset; bool handleWL(PacketPtr pkt); - // bool sendPacket(); - // //one queue for write and one for read a priotizes write over read - // void readApplyBuffer(); - bool handleMemResp(PacketPtr resp); - // void writePushBuffer(); - - //Events EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + 
MPU::MPU::MemPortsendTimingReq - */ + + bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ public: - Apply(const ApplyParams &apply); + BaseApplyEngine(const ApplyParams &apply); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc similarity index 91% rename from src/accl/graph/base/wl_engine.cc rename to src/accl/graph/base/base_wl_engine.cc index dc8f1dd744..7261069c17 100644 --- a/src/accl/graph/base/wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/graph/base/wl_engine.hh" +#include "accl/graph/base/base_wl_engine.hh" #include @@ -35,7 +35,7 @@ namespace gem5 { -WLEngine::WLEngine(const WLEngineParams ¶ms): +BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), @@ -45,7 +45,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): {} Port & -WLEngine::getPort(const std::string &if_name, PortID idx) +BaseWLEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "memPort") { return memPort; @@ -55,7 +55,7 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -64,7 +64,7 @@ WLEngine::WLMemPort::sendPacket(PacketPtr pkt) } void -WLEngine::WLMemPort::recvReqRetry() +BaseWLEngine::WLMemPort::recvReqRetry() { // We should have a blocked packet if this function is called. 
assert(_blocked && blockedPacket != nullptr); @@ -74,13 +74,13 @@ WLEngine::WLMemPort::recvReqRetry() } bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +BaseWLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } bool -WLEngine::handleWLUpdate(PacketPtr pkt){ +BaseWLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ queue.sendPktRetry = true; @@ -94,7 +94,7 @@ WLEngine::handleWLUpdate(PacketPtr pkt){ return true; } -void WLEngine::processNextWLReadEvent(){ +void BaseWLEngine::processNextWLReadEvent(){ auto queue = updateQueue; while (!queue.empty()){ //create a map instead of front PacketPtr pkt = queue.front(); @@ -117,7 +117,7 @@ void WLEngine::processNextWLReadEvent(){ } bool -WLEngine::handleMemResp(PacketPtr pkt) +BaseWLEngine::handleMemResp(PacketPtr pkt) { auto queue = responseQueue; if (queue.blocked()){ @@ -134,7 +134,7 @@ WLEngine::handleMemResp(PacketPtr pkt) } void -WLEngine::processNextWLReduceEvent(){ +BaseWLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; auto applyPort = reqPort; diff --git a/src/accl/graph/base/wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh similarity index 93% rename from src/accl/graph/base/wl_engine.hh rename to src/accl/graph/base/base_wl_engine.hh index 3654999b70..2095a20f1b 100644 --- a/src/accl/graph/base/wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -35,7 +35,7 @@ #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" -#include "params/WLEngine.hh" +#include "params/BaseWLEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" #include "sim/system.hh" @@ -43,7 +43,7 @@ namespace gem5 { -class WLEngine : public ClockedObject +class BaseWLEngine : public ClockedObject { private: //FIXME: Change this @@ -77,7 +77,7 @@ class WLEngine : public ClockedObject sendPktRetry(false){} }; - class WLMemPort : public RequestPort + class MemPort : public RequestPort 
{ private: WLEngine *owner; @@ -85,7 +85,7 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLMemPort(const std::string& name, WLEngine* owner): + MemPort(const std::string& name, WLEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -97,8 +97,7 @@ class WLEngine : public ClockedObject void recvReqRetry() override; }; - WLMemPort memPort; - + MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; @@ -122,7 +121,7 @@ class WLEngine : public ClockedObject */ public: - WLEngine(const WLEngineParams ¶ms); + BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py new file mode 100644 index 0000000000..0d03e71e54 --- /dev/null +++ b/src/accl/graph/sega/ApplyEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from build.NULL.python.m5.proxy import Parent +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.BaseApplyEngine import BaseApplyEngine + +class ApplyEngine(BaseApplyEngine): + type = 'ApplyEngine' + cxx_header = "accl/graph/sega/apply_engine.hh" + cxx_class = 'gem5::MPU' + + mpu = Param.MPU(Parent, "MPU object that owns this ApplyEngine") diff --git a/src/accl/graph/base/WLEngine.py b/src/accl/graph/sega/WLEngine.py similarity index 84% rename from src/accl/graph/base/WLEngine.py rename to src/accl/graph/sega/WLEngine.py index deaee20935..a8f3bd20ea 100644 --- a/src/accl/graph/base/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -25,14 +25,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.BaseWLEngine import BaseWLEngine -class WLEngine(ClockedObject): +class WLEngine(BaseWLEngine): type = 'WLEngine' - cxx_header = "accl/wl_engine.hh" - cxx_class = 'gem5::WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::MPU' - wlQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") + mpu = Param.MPU(Parent, "MPU object that owns this WLEngine") \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh new file mode 100644 index 0000000000..e69de29bb2 From bfdec933f77713641144d1a2bd4fa1c4aec53faa Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 11:25:17 -0800 Subject: [PATCH 037/247] Restructuring classes. 
--- src/accl/graph/base/BasePushEngine.py | 37 ++++++ src/accl/graph/base/SConscript | 4 +- .../{push_engine.cc => base_push_engine.cc} | 125 +++++------------- .../{push_engine.hh => base_push_engine.hh} | 66 ++------- src/accl/graph/sega/MPU.py | 2 +- src/accl/graph/{base => sega}/PushEngine.py | 14 +- src/accl/graph/sega/push_engine.cc | 0 src/accl/graph/sega/push_engine.hh | 0 8 files changed, 90 insertions(+), 158 deletions(-) create mode 100644 src/accl/graph/base/BasePushEngine.py rename src/accl/graph/base/{push_engine.cc => base_push_engine.cc} (77%) rename src/accl/graph/base/{push_engine.hh => base_push_engine.hh} (66%) rename src/accl/graph/{base => sega}/PushEngine.py (83%) create mode 100644 src/accl/graph/sega/push_engine.cc create mode 100644 src/accl/graph/sega/push_engine.hh diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py new file mode 100644 index 0000000000..6ed5d25978 --- /dev/null +++ b/src/accl/graph/base/BasePushEngine.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BasePushEngine(ClockedObject): + type = 'BasePushEngine' + cxx_header = "accl/graph/base/base_push_engine.hh" + cxx_class = 'gem5::BasePushEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 18ac71eb7d..a881fa1e6e 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -28,10 +28,10 @@ Import('*') SimObject('Apply.py') -SimObject('PushEngine.py') +SimObject('BasePushEngine.py') SimObject('WLEngine.py') Source('apply.cc') -Source('push_engine.cc') +Source('base_push_engine.cc') Source('wl_engine.cc') Source('util.cc') diff --git a/src/accl/graph/base/push_engine.cc b/src/accl/graph/base/base_push_engine.cc similarity index 77% rename from src/accl/graph/base/push_engine.cc rename to src/accl/graph/base/base_push_engine.cc index 125433653b..9fbc89221f 100644 --- a/src/accl/graph/base/push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -26,18 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/push_engine.hh" +#include "accl/graph/base/base_push_engine.hh" -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), - respPort(name() + ".respPort", this), +BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), @@ -50,21 +47,29 @@ PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), } Port & -PushEngine::getPort(const std::string &if_name, PortID idx) +BasePushEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } +RequestorID +BasePushEngine::getRequestorId() +{ + return requestorId; +} + +void +BasePushEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void -PushEngine::startup() +BasePushEngine::startup() { //FIXME: This is the current version of our initializer. // This should be updated in the future. 
@@ -99,75 +104,14 @@ PushEngine::startup() } -AddrRangeList -PushEngine::PushRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleUpdate(pkt); -} - -Tick -PushEngine::PushRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -PushEngine::PushRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -PushEngine::PushReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -PushEngine::PushReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::PushReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - bool -PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) +BasePushEngine::MemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } void -PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +BasePushEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
@@ -179,7 +123,7 @@ PushEngine::PushMemPort::sendPacket(PacketPtr pkt) } void -PushEngine::PushMemPort::recvReqRetry() +BasePushEngine::MemPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); @@ -191,20 +135,8 @@ PushEngine::PushMemPort::recvReqRetry() } } -AddrRangeList -PushEngine::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - -void -PushEngine::recvFunctional(PacketPtr pkt) -{ - memPort.sendFunctional(pkt); -} - bool -PushEngine::handleUpdate(PacketPtr pkt) +BasePushEngine::handleUpdate(PacketPtr pkt) { //FIXME: There should be a check if the queues are full. // if (vertexQueueLen < vertexQueueSize) { @@ -223,7 +155,8 @@ PushEngine::handleUpdate(PacketPtr pkt) return true; } -void PushEngine::processNextReceiveEvent() +void +BasePushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.front(); uint8_t* data = updatePkt->getPtr(); @@ -274,7 +207,8 @@ void PushEngine::processNextReceiveEvent() } } -void PushEngine::processNextReadEvent() +void +BasePushEngine::processNextReadEvent() { PacketPtr pkt = memReqQueue.front(); if (!memPort.blocked()) { @@ -288,7 +222,7 @@ void PushEngine::processNextReadEvent() } bool -PushEngine::handleMemResp(PacketPtr pkt) +BasePushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -321,7 +255,8 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -void PushEngine::processNextSendEvent() +void +BasePushEngine::processNextSendEvent() { PacketPtr pkt = updateQueue.front(); if (!reqPort.blocked()) { diff --git a/src/accl/graph/base/push_engine.hh b/src/accl/graph/base/base_push_engine.hh similarity index 66% rename from src/accl/graph/base/push_engine.hh rename to src/accl/graph/base/base_push_engine.hh index fbb7d6915a..591f4ab734 100644 --- a/src/accl/graph/base/push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -31,67 +31,27 @@ #include -#include "base/addr_range.hh" #include 
"mem/port.hh" #include "mem/packet.hh" -#include "params/PushEngine.hh" +#include "params/BasePushEngine.hh" #include "sim/clocked_object.hh" -#include "sim/system.hh" namespace gem5 { -class PushEngine : public ClockedObject +class BasePushEngine : public ClockedObject { private: - class PushRespPort : public ResponsePort + class MemPort : public RequestPort { private: - PushEngine* owner; - - public: - PushRespPort(const std::string& name, PushEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class PushReqPort : public RequestPort - { - private: - PushEngine* owner; + BasePushEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - PushReqPort(const std::string& name, PushEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - class PushMemPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - PushMemPort(const std::string& name, PushEngine* owner): + MemPort(const std::string& name, PushEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -106,13 +66,9 @@ class PushEngine : public ClockedObject virtual void startup() override; - System* const system; - const RequestorID requestorId; + RequestorID requestorId; - PushReqPort reqPort; - PushRespPort respPort; - - PushMemPort memPort; + MemPort memPort; std::queue vertexQueue; // int vertexQueueSize; @@ -128,9 +84,6 @@ class PushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - AddrRangeList 
getAddrRanges(); - void recvFunctional(PacketPtr pkt); - bool handleUpdate(PacketPtr pkt); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -144,11 +97,14 @@ class PushEngine : public ClockedObject public: - PushEngine(const PushEngineParams ¶ms); + BasePushEngine(const PushEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); + }; } diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index b6e136dda5..923c1a2f38 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -28,7 +28,7 @@ from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files + from m5.objects.WLEngine import WLEngine from m5.objects.PushEngine import PushEngine from m5.objects.ApplyEngine import ApplyEngine diff --git a/src/accl/graph/base/PushEngine.py b/src/accl/graph/sega/PushEngine.py similarity index 83% rename from src/accl/graph/base/PushEngine.py rename to src/accl/graph/sega/PushEngine.py index 7fef165169..fa9d921a26 100644 --- a/src/accl/graph/base/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -29,9 +29,13 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class PushEngine(ClockedObject): - type = 'PushEngine' - cxx_header = "accl/push_engine.hh" - cxx_class = 'gem5::PushEngine' +from m5.objects.WLEngine import WLEngine +from m5.objects.PushEngine import PushEngine +from m5.objects.ApplyEngine import ApplyEngine - memPort = RequestPort("Port to communicate with the memory") +class MPU(ClockedObject): + type = 'MPU' + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = 'gem5::MPU' + + mpu = Param.MPU(Parent, "The MPU object than owns this PushEngine.") diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff 
--git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh new file mode 100644 index 0000000000..e69de29bb2 From 3f798dfd17a1ec8087fcdd6c904ae1e8777c91c1 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 11:34:28 -0800 Subject: [PATCH 038/247] Adding RequestorID --- src/accl/graph/base/base_apply_engine.cc | 13 +++++++++++++ src/accl/graph/base/base_apply_engine.hh | 3 +++ src/accl/graph/base/base_wl_engine.cc | 13 +++++++++++++ src/accl/graph/base/base_wl_engine.hh | 4 ++++ 4 files changed, 33 insertions(+) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index c88d14a2c2..111ea16f2e 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,6 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), @@ -54,6 +55,18 @@ BaseApplyEngine::getPort(const std::string &if_name, PortID idx) } } +RequestorID +BaseApplyEngine::getRequestorId() +{ + return requestorId; +} + +void +BaseApplyEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index c2d2f26387..3304e58a92 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -116,6 +116,9 @@ class BaseApplyEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 7261069c17..dec37636ba 100644 --- 
a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -37,6 +37,7 @@ namespace gem5 BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), @@ -54,6 +55,18 @@ BaseWLEngine::getPort(const std::string &if_name, PortID idx) } } +RequestorID +BaseWLEngine::getRequestorId() +{ + return requestorId; +} + +void +BaseWLEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 2095a20f1b..a63d9b1ef7 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -97,6 +97,7 @@ class BaseWLEngine : public ClockedObject void recvReqRetry() override; }; + RequestorID requestorId; MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; @@ -125,6 +126,9 @@ class BaseWLEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); }; } From d8680eeef1505fb937c7e1ddc8f37681669f46e5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 13:01:19 -0800 Subject: [PATCH 039/247] Definining MPU interfaces. 
--- src/accl/graph/base/base_push_engine.cc | 35 +---- src/accl/graph/base/base_push_engine.hh | 24 ---- src/accl/graph/base/base_wl_engine.hh | 1 + src/accl/graph/sega/mpu.cc | 183 ++++++++++++++++++++++++ src/accl/graph/sega/mpu.hh | 134 +++++++++++++++++ src/mem/packet.hh | 3 + 6 files changed, 322 insertions(+), 58 deletions(-) create mode 100644 src/accl/graph/sega/mpu.cc diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 9fbc89221f..c4388cab4b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -49,11 +49,7 @@ BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(par Port & BasePushEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } + return SimObject::getPort(if_name, idx); } RequestorID @@ -104,36 +100,7 @@ BasePushEngine::startup() } -bool -BasePushEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} -void -BasePushEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -void -BasePushEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} bool BasePushEngine::handleUpdate(PacketPtr pkt) diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 591f4ab734..2265bb32db 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -42,34 +42,10 @@ namespace gem5 class BasePushEngine : public ClockedObject { private: - - class MemPort : public RequestPort - { - private: - BasePushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, PushEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - virtual void startup() override; RequestorID requestorId; - MemPort memPort; - std::queue vertexQueue; // int vertexQueueSize; // int vertexQueueLen; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index a63d9b1ef7..3a683bb6e4 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -105,6 +105,7 @@ class BaseWLEngine : public ClockedObject std::unordered_map requestOffset; //Events + //FIXME: make handleWLUpdate public bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..c45ad78ef9 --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 The Regents of the University of 
California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +void +MPU::startup() +{ + if (((int16_t) applyEngine->getRequestorId) == -1) { + applyEngine->setRequestorId(nextRequestorId++); + } + if (((int16_t) pushEngine->getRequestorId) == -1) { + pushEngine->setRequestorId(nextRequestorId++); + } + if (((int16_t) wlEngine->getRequestorId) == -1) { + wlEngine->setRequestorId(nextRequestorId++); + } +} + +AddrRangeList +MPU::MPURespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +MPU::MPURespPort::recvTimingReq(PacketPtr pkt) +{ + return wlEngine->handleWLUpdate(pkt); +} + +Tick +MPU::MPURespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +MPU::MPURespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +MPU::MPURespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +MPU::MPUReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +MPU::MPUReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +MPU::MPUReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +MPU::MPUMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +void +MPU::MPUMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +AddrRangeList +MPU::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + +void +MPU::recvFunctional(PacketPtr pkt) +{ + if (pkt->isUpdateWL()) { + panic("Functional requests should not be made to WL.") + //TODO: Might be a good idea to implement later. + // wlEngine->recvFunctional(pkt); + } else { + memPort.recvFuctional(pkt); + } +} + +bool +MPU::handleMemReq(PacketPtr pkt) +{ + return memPort.recvTimingReq(pkt); +} + +void +MPU::handleMemResp(PacketPtr pkt) +{ + //TODO: Implement this; +} + +bool +MPU::recvWLNotif(WorkListItem wl) +{ + return applyEngine->recvWLUpdate(wl); +} + +bool +MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +{ + return pushEngine->recvApplyUpdate(prop, degree, edgeIndex); +} + +bool +MPU::recvPushUpdate(PacketPtr pkt) +{ + // TODO: Implement this Mahyar +} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index e69de29bb2..bc4ba5d53b 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include "accl/graph/base/util.hh" +#include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class MPU : public ClockedObject +{ + private: + class MPURespPort : public ResponsePort + { + private: + MPU* owner; + + public: + MPURespPort(const std::string& name, MPU* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class MPUReqPort : public RequestPort + { + private: + MPU* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MPUReqPort(const std::string& name, MPU* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + class MPUMemPort : public RequestPort + { + private: + MPU* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, MPU* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + virtual void startup(); + + RequestorID nextRequestorId; + + MPURespPort respPort; + MPUReqPort reqPort; + MPUMemPort memPort; + + ApplyEngine* applyEngine; + PushEngine* pushEngine; + WLEngine* wlEngine; + + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr 
pkt); + + bool handleMemReq(PacketPtr pkt); + void handleMemResp(PacketPtr pkt); + + bool recvWLNotif(WorkListItem wl); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + bool recvPushUpdate(PacketPtr pkt); + + public: + + MPU(const MPUParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +} + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 5332ee32a2..44c44d08a6 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -178,6 +178,7 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent + UpdateWL, // MPU Accelerator NUM_COMMAND_ATTRIBUTES }; @@ -267,6 +268,8 @@ class MemCmd cmd == ReadCleanReq || cmd == ReadSharedReq); } + bool isUpdateWL() const {return testCmdAttrib(updateWL);} + Command responseCommand() const { From 1b1bbac7eedbbf1dfc1f8a5d1495227c6a87e789 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 15:46:10 -0800 Subject: [PATCH 040/247] Adding changes to ApplyEngine and WLEngine --- src/accl/graph/base/base_apply_engine.hh | 28 ++++-------------------- src/accl/graph/base/base_wl_engine.hh | 26 +++++----------------- 2 files changed, 9 insertions(+), 45 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 3304e58a92..d603cb2713 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -73,31 +73,8 @@ class BaseApplyEngine : public ClockedObject {} }; - class MemPort : public RequestPort - { - private: - BaseApplyEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseApplyEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - 
bool blocked(){ return _blocked;} - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - const RequestorID requestorId; - MemPort memPort; - ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -106,11 +83,14 @@ class BaseApplyEngine : public ClockedObject bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - + //FIXME: make void bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); + protected: + virtual void sendMemReq(PacketPtr pkt) = 0; + public: BaseApplyEngine(const ApplyParams &apply); diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3a683bb6e4..0530c64c72 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -77,26 +77,6 @@ class BaseWLEngine : public ClockedObject sendPktRetry(false){} }; - class MemPort : public RequestPort - { - private: - WLEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, WLEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - RequestorID requestorId; MemPort memPort; WLQueue updateQueue; @@ -113,6 +93,7 @@ class BaseWLEngine : public ClockedObject If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ + //FIXME: make void bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); @@ -121,8 +102,11 @@ class BaseWLEngine : public ClockedObject read + write Write edgelist loc in buffer */ + protected: + virtual void sendMemReq(PacketPtr pkt) = 0; + virtual void sendApplyReq(WorkListItem wl) = 0; - public: + public: 
BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, From 64080f26149dd3295e452b1e842e2fef1ef8613c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 22:39:08 -0800 Subject: [PATCH 041/247] Finished restructured for ApplyE and WLE, pre-compiled --- src/accl/graph/base/BaseApplyEngine.py | 9 +-- src/accl/graph/base/SConscript | 8 +- src/accl/graph/base/base_apply_engine.cc | 94 +++++++++--------------- src/accl/graph/base/base_apply_engine.hh | 13 ++-- src/accl/graph/base/base_wl_engine.cc | 78 +++++--------------- src/accl/graph/base/base_wl_engine.hh | 17 ++--- src/accl/graph/sega/SConscript | 37 ++++++++++ src/accl/graph/sega/apply_engine.cc | 48 ++++++++++++ src/accl/graph/sega/apply_engine.hh | 54 ++++++++++++++ src/accl/graph/sega/wl_engine.cc | 50 +++++++++++++ src/accl/graph/sega/wl_engine.hh | 57 ++++++++++++++ 11 files changed, 321 insertions(+), 144 deletions(-) create mode 100644 src/accl/graph/sega/SConscript diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 80aa430139..23fdfbb08a 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -29,10 +29,9 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class Apply(ClockedObject): - type = 'Apply' - cxx_header = "accl/apply.hh" - cxx_class = 'gem5::Apply' +class BaseApplyEngine(ClockedObject): + type = 'BaseApplyEngine' + cxx_header = "accl/base_apply_engine.hh" + cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index a881fa1e6e..cc55100064 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,11 +27,11 @@ Import('*') -SimObject('Apply.py') +SimObject('BaseApplyEngine.py') SimObject('BasePushEngine.py') 
-SimObject('WLEngine.py') +SimObject('BaseWLEngine.py') -Source('apply.cc') +Source('base_apply_engine.cc') Source('base_push_engine.cc') -Source('wl_engine.cc') +Source('base_wl_engine.cc') Source('util.cc') diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 111ea16f2e..805a7649b7 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), @@ -48,11 +47,7 @@ BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): Port & BaseApplyEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { return SimObject::getPort(if_name, idx); - } } RequestorID @@ -67,29 +62,6 @@ BaseApplyEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseApplyEngine::ApplyMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -BaseApplyEngine::ApplyMemPort::recvReqRetry() -{ - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - bool BaseApplyEngine::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ @@ -106,19 +78,19 @@ bool BaseApplyEngine::handleWL(PacketPtr pkt){ void BaseApplyEngine::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - if (!memPort.blocked()){ - PacketPtr pkt = queue.front(); - if (queue.sendPktRetry && !queue.blocked()){ - // respPort.trySendRetry(); - queue.sendPktRetry = false; - } - // conver 
to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - memPort.sendPacket(memPkt); + // if (!memPort.blocked()){ + PacketPtr pkt = queue.front(); + // if (queue.sendPktRetry && !queue.blocked()){ + // // respPort.trySendRetry(); + // queue.sendPktRetry = false; + // } + // conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = req_offset; + if (parent.sendMemReq(memPkt)){ queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -157,26 +129,26 @@ BaseApplyEngine::processNextApplyEvent(){ uint32_t temp_prop = wl.temp_prop; if (temp_prop != prop){ - if (!memPort.blocked() && !reqPort.blocked()){ - //update prop with temp_prop - if(prop < temp_prop){ - wl.prop = prop; - }else{ - wl.prop = temp_prop; - } - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort.sendPacket(writePkt); - reqPort.sendPacket(writePkt); + // if (!memPort.blocked() && !reqPort.blocked()){ + //update prop with temp_prop + if(prop < temp_prop){ + wl.prop = prop; + }else{ + wl.prop = temp_prop; + } + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. 
+ PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (parent.sendMemReq(writePkt) && + parent.recvApplyNotif(WorkListItem.prop, + WorkListItem.degree, + WorkListItem.edgeIndex)){ queue.pop(); - if (queue.sendPktRetry && !queue.blocked()){ - // memPort.trySendRetry(); - queue.sendPktRetry = false; - } + // memPort.trySendRetry(); + // queue.sendPktRetry = false; } }else{ queue.applyQueue.pop(); diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index d603cb2713..27d906f060 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_APPLY_HH__ -#define __ACCL_APPLY_HH__ +#ifndef __ACCL_BASEAPPLY_HH__ +#define __ACCL_BASEAPPLY_HH__ #include #include @@ -83,13 +83,14 @@ class BaseApplyEngine : public ClockedObject bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - //FIXME: make void - bool handleMemResp(PacketPtr resp); + + void handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); protected: - virtual void sendMemReq(PacketPtr pkt) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: BaseApplyEngine(const ApplyParams &apply); @@ -103,4 +104,4 @@ class BaseApplyEngine : public ClockedObject } -#endif // __ACCL_APPLY_HH__ +#endif // __BASEACCL_APPLY_HH__ diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index dec37636ba..4af6f5e326 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), 
updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), @@ -48,11 +47,7 @@ BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): Port & BaseWLEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } + return SimObject::getPort(if_name, idx); } RequestorID @@ -67,31 +62,6 @@ BaseWLEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -BaseWLEngine::WLMemPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. - assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -bool -BaseWLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - bool BaseWLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; @@ -109,20 +79,16 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt){ void BaseWLEngine::processNextWLReadEvent(){ auto queue = updateQueue; - while (!queue.empty()){ //create a map instead of front - PacketPtr pkt = queue.front(); - /// conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = - std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - if (!memPort.blocked()){ - queue.pop(); - memPort.sendPacket(memPkt); - break; - } + PacketPtr pkt = queue.front(); + /// conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr request = + std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = 
req_offset; + if (parent.sendMemReq()){ + queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ schedule(nextWLReadEvent, nextCycle()); @@ -150,7 +116,6 @@ void BaseWLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; - auto applyPort = reqPort; PacketPtr update = updateQ.front(); uint8_t* value = update->getPtr(); PacketPtr pkt = queue.front(); @@ -164,17 +129,16 @@ BaseWLEngine::processNextWLReduceEvent(){ if(*value < temp_prop){ temp_prop = *value; } - if (!memPort.blocked() && !applyPort.blocked()){ - wl.temp_prop = temp_prop; - uint8_t* wlItem = workListToMemory(wl); - memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort.sendPacket(writePkt); - applyPort.sendPacket(writePkt); + // if (!memPort.blocked() && !applyPort.blocked()){ + wl.temp_prop = temp_prop; + uint8_t* wlItem = workListToMemory(wl); + memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (parent.sendMemReq(writePkt) && + parent.sendWLNotif(writePkt)) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); @@ -187,12 +151,10 @@ BaseWLEngine::processNextWLReduceEvent(){ else{ queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 0530c64c72..1d0f3e33c1 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_WLE_HH__ -#define __ACCL_WLE_HH__ +#ifndef __ACCL_BASEWLENGINE_HH__ +#define __ACCL_BASEWLENGINE_HH__ #include #include @@ -78,23 +78,19 @@ class BaseWLEngine : public ClockedObject }; RequestorID requestorId; - MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; std::unordered_map requestOffset; //Events - //FIXME: make handleWLUpdate public - bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ - //FIXME: make void - bool handleMemResp(PacketPtr resp); + void handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -103,8 +99,8 @@ class BaseWLEngine : public ClockedObject Write edgelist loc in buffer */ protected: - virtual void sendMemReq(PacketPtr pkt) = 0; - virtual void sendApplyReq(WorkListItem wl) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool sendWLNotif(WorkListItem wl) = 0; public: BaseWLEngine(const BaseWLEngineParams ¶ms); @@ -114,8 +110,9 @@ class BaseWLEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorId requestorId); + bool handleWLUpdate(PacketPtr pkt); }; } -#endif // __ACCL_WLE_HH__ +#endif // __ACCL_BASEWLENGINE_HH__ diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..79afe3b7d0 --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import('*') + +SimObject('ApplyEngine.py') +SimObject('MPU.py') +SimObject('WLEngine.py') + +Source('apply_engine.cc') +Source('mpu.cc') +Source('push_engine.cc') +Source('wl_engine.cc') diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index e69de29bb2..41a568bd27 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/apply_engine.hh" + +namespace gem5{ + +ApplyEngine:ApplyEngine(const BaseApplyEngine ¶ms): + BaseApplyEngine(params) +{} + +virtual bool +ApplyEngine::sendMemReq(PacketPtr pkt){ + return mpu->handleMemReq(pkt); +} + +virtual bool +ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ + mpu->recvApplyNotif(prop, degree, edgeIndex); + +} + +} \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index e69de29bb2..fd2bca008f 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_APPLY_HH__ +#define __ACCL_APPLY_HH__ + +#include +#include + +#include "accl/graph/base/base_apply_engine.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/ApplyEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" + +namespace gem5 +{ + +class ApplyEngine : public BaseApplyEngine +{ + private: + MPU mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); + virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + public: + ApplyEngine(const ApplyEngineParams ¶ms); +} diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e69de29bb2..9608d0cbc4 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" + +#include + +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams ¶ms): + BaseWLEngine(params) +{} + +virtual bool +WLEngine::sendMemReq(PacketPtr pkt){ + return mpu->handleMemReq(pkt); +} + +// FIXME: handle the case where Apply queue is full +virtual bool +WLEngine::sendWLNotif(WorkListItem wl){ + mpu->recvWLNotif(wl); + return true; +} \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index e69de29bb2..eee6b1f22f 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_WLENGINE_HH__ +#define __ACCL_WLENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/WLEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" +#include "sim/system.hh" + + +namespace gem5 +{ + +class WLEngine : public BaseWorkListEngine +{ + private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); + virtual bool sendWLNotif(WorkListItem wl); + public: + WLEngine(const WLEngineParams ¶ms); +} \ No newline at end of file From c6ce909250341eed9d6fe814c45eb402dad0d3b7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 23:31:49 -0800 Subject: [PATCH 042/247] Finished restructure for PushEngine. Pre-compile. 
--- src/accl/graph/base/base_push_engine.cc | 30 +++++-------- src/accl/graph/base/base_push_engine.hh | 19 +++++++- src/accl/graph/sega/mpu.cc | 29 +++++++++++-- src/accl/graph/sega/mpu.hh | 2 +- src/accl/graph/sega/push_engine.cc | 58 +++++++++++++++++++++++++ src/accl/graph/sega/push_engine.hh | 55 +++++++++++++++++++++++ 6 files changed, 169 insertions(+), 24 deletions(-) diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index c4388cab4b..6871154276 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -33,7 +33,8 @@ namespace gem5 { -BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(params), +BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : + ClockedObject(params), requestorId(0), memPort(name() + ".memPort", this), // vertexQueueSize(params.vertex_queue_size), @@ -103,7 +104,8 @@ BasePushEngine::startup() bool -BasePushEngine::handleUpdate(PacketPtr pkt) +BasePushEngine::recvApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edge_index) { //FIXME: There should be a check if the queues are full. 
// if (vertexQueueLen < vertexQueueSize) { @@ -115,7 +117,7 @@ BasePushEngine::handleUpdate(PacketPtr pkt) // return true; // } // return false; - vertexQueue.push(pkt); + notifQueue.emplace(prop, degree, edge_index); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); } @@ -125,21 +127,15 @@ BasePushEngine::handleUpdate(PacketPtr pkt) void BasePushEngine::processNextReceiveEvent() { - PacketPtr updatePkt = vertexQueue.front(); - uint8_t* data = updatePkt->getPtr(); - - // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) - uint32_t edge_index = *((uint32_t *)data); - uint32_t degree = *((uint32_t *)(data + 4)); - uint32_t value = *((uint32_t *)(data + 8)); + ApplyNotif notif = notifQueue.front(); std::vector addr_queue; std::vector offset_queue; std::vector num_edge_queue; - for (uint32_t index = 0; index < degree; index++) { + for (uint32_t index = 0; index < notif.degree; index++) { // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 + (edge_index + index) * sizeof(Edge); + Addr edge_addr = 1048576 + (notif.edge_index + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { @@ -164,10 +160,10 @@ BasePushEngine::processNextReceiveEvent() memReqQueue.push(pkt); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = value; + reqValueMap[pkt->req] = notif.prop; } - vertexQueue.pop(); + notifQueue.pop(); if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { schedule(nextReadEvent, nextCycle()); @@ -178,8 +174,7 @@ void BasePushEngine::processNextReadEvent() { PacketPtr pkt = memReqQueue.front(); - if (!memPort.blocked()) { - memPort.sendPacket(pkt); + if (!sendMemReq(pkt)) { memReqQueue.pop(); } @@ -226,8 +221,7 @@ void BasePushEngine::processNextSendEvent() { PacketPtr pkt = updateQueue.front(); - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); + if 
(!sendPushUpdate(pkt)) { updateQueue.pop(); } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 2265bb32db..63ad3a6652 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -42,11 +42,22 @@ namespace gem5 class BasePushEngine : public ClockedObject { private: + + struct ApplyNotif { + uint32_t prop; + uint32_t degree; + uint32_t edgeIndex; + + ApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index): + prop(prop), degree(degree), edgeIndex(edge_index) + {} + }; + virtual void startup() override; RequestorID requestorId; - std::queue vertexQueue; + std::queue notifQueue; // int vertexQueueSize; // int vertexQueueLen; @@ -60,7 +71,7 @@ class BasePushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - bool handleUpdate(PacketPtr pkt); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -71,6 +82,10 @@ class BasePushEngine : public ClockedObject EventFunctionWrapper nextSendEvent; void processNextSendEvent(); + protected: + virtual bool sendMemRequest(PacketPtr pkt) = 0; + virtual bool sendPushUpdate(PacketPtr pkt) = 0; + public: BasePushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index c45ad78ef9..09ab23a835 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -161,7 +161,16 @@ MPU::handleMemReq(PacketPtr pkt) void MPU::handleMemResp(PacketPtr pkt) { - //TODO: Implement this; + RequestorID requestorId = pkt->requestorId(); + if (applyEngine->getRequestorId() == requestorId) { + applyEngine->handleMemResp(pkt); + } else if (pushEngine->getRequestorId() == requestorId) { + pushEngine->handleMemResp(pkt); + } else if (wlEngine->getRequestorId() == requestorId) { + wlEngine->handleMemResp(pkt); + } else { + panic("Received a response with an unknown 
requestorId."); + } } bool @@ -173,11 +182,25 @@ MPU::recvWLNotif(WorkListItem wl) bool MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) { - return pushEngine->recvApplyUpdate(prop, degree, edgeIndex); + return pushEngine->recvApplyUpdate(prop, degree, edge_index); } bool MPU::recvPushUpdate(PacketPtr pkt) { - // TODO: Implement this Mahyar + Addr addr = pkt->getAddr(); + for (auto addr_range: memPort.getAddrRangeList()) { + if (addr_range.contains(addr)) { + if (!memPort.sendPacket(pkt)) { + return false; + } + return true; + } + } + + if (!reqPort.sendPacket(pkt)) { + return false; + } + return true; + } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index bc4ba5d53b..93d1dd8bb3 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -120,7 +120,7 @@ class MPU : public ClockedObject void handleMemResp(PacketPtr pkt); bool recvWLNotif(WorkListItem wl); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); bool recvPushUpdate(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e69de29bb2..e43512c6f4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const PushEngine ¶ms) : + BasePushEngine(params), + owner(params.mpu) +{ +} + +Port & +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + return SimObject::getPort(if_name, idx); +} + +bool +PushEngine::sendMemReq(PacketPtr) +{ + return owner->handleMemReq(pkt); +} + +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + return owner->recvPushUpdate(pkt); +} + +} diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e69de29bb2..54ef72d5f9 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include "accl/graph/base/base_push_engine.hh" + +namespace gem5 +{ +class PushEngine : public BasePushEngine +{ + private: + MPU* owner; + + protected: + virtual bool sendMemRequest(PacketPtr pkt); + virtual bool sendPushUpdate(PacketPtr pkt); + + public: + PushEngine(const PushEngineParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + +} + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ \ No newline at end of file From 8a2dae86375bd48db32f494343df2fc9d5d35816 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 23:51:02 -0800 Subject: [PATCH 043/247] Debugging. 
--- src/accl/graph/base/base_apply_engine.cc | 31 +++++++++--------------- src/accl/graph/base/base_apply_engine.hh | 13 +++++----- src/accl/graph/base/base_push_engine.hh | 9 ++++--- src/accl/graph/base/base_wl_engine.cc | 6 ++--- src/accl/graph/base/base_wl_engine.hh | 9 ++++--- src/accl/graph/base/util.cc | 2 +- src/accl/graph/sega/ApplyEngine.py | 7 ++---- src/accl/graph/sega/MPU.py | 6 ++--- src/accl/graph/sega/PushEngine.py | 16 +++++------- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/WLEngine.py | 7 ++---- src/accl/graph/sega/apply_engine.cc | 6 ++--- src/accl/graph/sega/apply_engine.hh | 10 +++++--- src/accl/graph/sega/mpu.cc | 11 ++++++--- src/accl/graph/sega/mpu.hh | 5 ++++ src/accl/graph/sega/wl_engine.hh | 9 ++++--- 16 files changed, 75 insertions(+), 73 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 805a7649b7..301f5931bf 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/base_apply_engine.hh" +#include "accl/graph/base/base_apply_engine.hh" #include @@ -90,7 +90,7 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (parent.sendMemReq(memPkt)){ + if (sendMemReq(memPkt)){ queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -98,22 +98,13 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ } } -bool +void BaseApplyEngine::handleMemResp(PacketPtr pkt) { - auto queue = applyWriteQueue; - - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else - queue.push(pkt); - - if(!nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); - } - return true; - return true; + // FIXME: change the event, remove the retry parts + if(!nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); + } } void @@ -142,10 +133,10 @@ BaseApplyEngine::processNextApplyEvent(){ //Create memory write requests. PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (parent.sendMemReq(writePkt) && - parent.recvApplyNotif(WorkListItem.prop, - WorkListItem.degree, - WorkListItem.edgeIndex)){ + if (sendMemReq(writePkt) && + recvApplyNotif(wl.prop, + wl.degree, + wl.edgeIndex)){ queue.pop(); // memPort.trySendRetry(); // queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 27d906f060..56b43cfb7b 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -26,14 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_BASEAPPLY_HH__ -#define __ACCL_BASEAPPLY_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ #include #include #include "mem/packet.hh" #include "mem/port.hh" +#include "mem/request.hh" #include "params/BaseApplyEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" @@ -73,7 +74,7 @@ class BaseApplyEngine : public ClockedObject {} }; - const RequestorID requestorId; + RequestorID requestorId; ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -93,15 +94,15 @@ class BaseApplyEngine : public ClockedObject virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: - BaseApplyEngine(const ApplyParams &apply); + BaseApplyEngine(const BaseApplyEngineParams &apply); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); }; } -#endif // __BASEACCL_APPLY_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 63ad3a6652..873cb26b3d 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_PUSH_ENGINE_HH__ -#define __ACCL_PUSH_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ #include #include "mem/port.hh" +#include "mem/request.hh" #include "mem/packet.hh" #include "params/BasePushEngine.hh" #include "sim/clocked_object.hh" @@ -94,10 +95,10 @@ class BasePushEngine : public ClockedObject PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); }; } -#endif // __ACCL_PUSH_ENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 4af6f5e326..b863b38e19 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -87,7 +87,7 @@ void BaseWLEngine::processNextWLReadEvent(){ std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (parent.sendMemReq()){ + if (sendMemReq()){ queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ @@ -135,8 +135,8 @@ BaseWLEngine::processNextWLReduceEvent(){ memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (parent.sendMemReq(writePkt) && - parent.sendWLNotif(writePkt)) { + if (sendMemReq(writePkt) && + sendWLNotif(writePkt)) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 1d0f3e33c1..3d807d8b06 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_BASEWLENGINE_HH__ -#define __ACCL_BASEWLENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ #include #include +#include "accl/graph/base/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" @@ -109,10 +110,10 @@ class BaseWLEngine : public ClockedObject PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); bool handleWLUpdate(PacketPtr pkt); }; } -#endif // __ACCL_BASEWLENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc index 40a1fc761b..0baa374714 100644 --- a/src/accl/graph/base/util.cc +++ b/src/accl/graph/base/util.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index 0d03e71e54..bb43836ff7 100644 --- a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -25,16 +25,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files from m5.objects.BaseApplyEngine import BaseApplyEngine class ApplyEngine(BaseApplyEngine): type = 'ApplyEngine' cxx_header = "accl/graph/sega/apply_engine.hh" - cxx_class = 'gem5::MPU' + cxx_class = 'gem5::ApplyEngine' - mpu = Param.MPU(Parent, "MPU object that owns this ApplyEngine") + mpu = Param.MPU(Parent.any, "MPU object that owns this ApplyEngine") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 923c1a2f38..046dfaf4e8 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -29,9 +29,9 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -from m5.objects.WLEngine import WLEngine -from m5.objects.PushEngine import PushEngine -from m5.objects.ApplyEngine import ApplyEngine +# from m5.objects.WLEngine import WLEngine +# from m5.objects.PushEngine import PushEngine +# from m5.objects.ApplyEngine import ApplyEngine class MPU(ClockedObject): type = 'MPU' diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index fa9d921a26..eb0eed18ab 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,15 +27,11 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BasePushEngine import BasePushEngine -from m5.objects.WLEngine import WLEngine -from m5.objects.PushEngine import PushEngine -from m5.objects.ApplyEngine import ApplyEngine +class PushEngine(BasePushEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' -class MPU(ClockedObject): - type = 'MPU' - cxx_header = "accl/graph/sega/mpu.hh" - cxx_class = 'gem5::MPU' - - mpu = Param.MPU(Parent, "The MPU object than owns this PushEngine.") + mpu = Param.MPU(Parent.any, "MPU object that 
owns this PushEngine") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 79afe3b7d0..dc19ece06b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -29,6 +29,7 @@ Import('*') SimObject('ApplyEngine.py') SimObject('MPU.py') +SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index a8f3bd20ea..12fbcf9b4f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -25,16 +25,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files from m5.objects.BaseWLEngine import BaseWLEngine class WLEngine(BaseWLEngine): type = 'WLEngine' cxx_header = "accl/graph/sega/wl_engine.hh" - cxx_class = 'gem5::MPU' + cxx_class = 'gem5::WLEngine' - mpu = Param.MPU(Parent, "MPU object that owns this WLEngine") \ No newline at end of file + mpu = Param.MPU(Parent.any, "MPU object that owns this WLEngine") \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 41a568bd27..64ae71e290 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -30,16 +30,16 @@ namespace gem5{ -ApplyEngine:ApplyEngine(const BaseApplyEngine ¶ms): +ApplyEngine::ApplyEngine(const BaseApplyEngine ¶ms): BaseApplyEngine(params) {} -virtual bool +bool ApplyEngine::sendMemReq(PacketPtr pkt){ return mpu->handleMemReq(pkt); } -virtual bool +bool ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ mpu->recvApplyNotif(prop, degree, edgeIndex); diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh 
index fd2bca008f..855ebbd8b0 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_APPLY_HH__ -#define __ACCL_APPLY_HH__ +#ifndef __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ #include #include @@ -45,10 +45,14 @@ namespace gem5 class ApplyEngine : public BaseApplyEngine { private: - MPU mpu; + MPU* mpu; protected: virtual bool sendMemReq(PacketPtr pkt); virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: ApplyEngine(const ApplyEngineParams ¶ms); +}; + } + +#endif // __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 09ab23a835..27f7c8e314 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -28,16 +28,19 @@ #include "accl/graph/sega/mpu.hh" +namespace gem5 +{ + void MPU::startup() { - if (((int16_t) applyEngine->getRequestorId) == -1) { + if (((int16_t) applyEngine->getRequestorId()) == -1) { applyEngine->setRequestorId(nextRequestorId++); } - if (((int16_t) pushEngine->getRequestorId) == -1) { + if (((int16_t) pushEngine->getRequestorId()) == -1) { pushEngine->setRequestorId(nextRequestorId++); } - if (((int16_t) wlEngine->getRequestorId) == -1) { + if (((int16_t) wlEngine->getRequestorId()) == -1) { wlEngine->setRequestorId(nextRequestorId++); } } @@ -204,3 +207,5 @@ MPU::recvPushUpdate(PacketPtr pkt) return true; } + +} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 93d1dd8bb3..b37821c200 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -39,6 +39,9 @@ #include "params/MPU.hh" #include "sim/clocked_object.hh" +namespace gem5 +{ + class MPU : public ClockedObject { private: @@ -129,6 +132,8 @@ class MPU : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; +}; + } 
#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index eee6b1f22f..938128e05f 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_WLENGINE_HH__ -#define __ACCL_WLENGINE_HH__ +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ #include #include @@ -54,4 +54,7 @@ class WLEngine : public BaseWorkListEngine virtual bool sendWLNotif(WorkListItem wl); public: WLEngine(const WLEngineParams ¶ms); -} \ No newline at end of file +}; + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ \ No newline at end of file From c57c564598e55741ed4c33194e7e0c2750efe9c1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 14:40:15 -0800 Subject: [PATCH 044/247] Lots of debugging. --- src/accl/graph/base/BaseApplyEngine.py | 2 +- src/accl/graph/base/BasePushEngine.py | 1 - src/accl/graph/base/BaseWLEngine.py | 4 +- src/accl/graph/base/base_apply_engine.cc | 105 ++++++++++------------ src/accl/graph/base/base_apply_engine.hh | 40 ++------- src/accl/graph/base/base_push_engine.cc | 45 +--------- src/accl/graph/base/base_push_engine.hh | 10 +-- src/accl/graph/base/base_wl_engine.cc | 6 +- src/accl/graph/base/base_wl_engine.hh | 6 +- src/accl/graph/sega/MPU.py | 6 +- src/accl/graph/sega/apply_engine.cc | 10 ++- src/accl/graph/sega/apply_engine.hh | 9 +- src/accl/graph/sega/mpu.cc | 107 +++++++++++++++++++---- src/accl/graph/sega/mpu.hh | 20 ++--- src/accl/graph/sega/push_engine.cc | 11 +-- src/accl/graph/sega/push_engine.hh | 12 ++- src/accl/graph/sega/wl_engine.cc | 19 ++-- src/accl/graph/sega/wl_engine.hh | 13 ++- src/mem/packet.hh | 3 - 19 files changed, 217 insertions(+), 212 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 23fdfbb08a..45d94b3fd2 
100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -31,7 +31,7 @@ class BaseApplyEngine(ClockedObject): type = 'BaseApplyEngine' - cxx_header = "accl/base_apply_engine.hh" + cxx_header = "accl/graph/base/base_apply_engine.hh" cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 6ed5d25978..891221c06d 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -34,4 +34,3 @@ class BasePushEngine(ClockedObject): cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 7384e876ef..3ecf030138 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -31,8 +31,8 @@ class BaseWLEngine(ClockedObject): type = 'BaseWLEngine' - cxx_header = "accl/base_wl_engine.hh" + cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") + diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 301f5931bf..731cd5c345 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -38,8 +38,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), requestorId(-1), - applyReadQueue(params.applyQueueSize), - applyWriteQueue(params.applyQueueSize), + queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} @@ -62,14 +61,14 @@ BaseApplyEngine::setRequestorId(RequestorID 
requestorId) this->requestorId = requestorId; } -bool BaseApplyEngine::handleWL(PacketPtr pkt){ - auto queue = applyReadQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else{ - queue.push(pkt); - } +bool BaseApplyEngine::recvWLNotif(Addr addr){ + // TODO: Investigate the situation where the queue is full. + // if (applyReadQueue.size() == queueSize){ + // // applyReadQueue.sendPktRetry = true; + // return true; + // } else{ + applyReadQueue.push(addr); + // } if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } @@ -77,78 +76,64 @@ bool BaseApplyEngine::handleWL(PacketPtr pkt){ } void BaseApplyEngine::processNextApplyCheckEvent(){ - auto queue = applyReadQueue; - // if (!memPort.blocked()){ - PacketPtr pkt = queue.front(); - // if (queue.sendPktRetry && !queue.blocked()){ - // // respPort.trySendRetry(); - // queue.sendPktRetry = false; - // } - // conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; + Addr addr = applyReadQueue.front(); + Addr req_addr = (addr / 64) * 64; + int req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; if (sendMemReq(memPkt)){ - queue.pop(); + applyReadQueue.pop(); } - if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ + if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } } -void +bool BaseApplyEngine::handleMemResp(PacketPtr pkt) { // FIXME: change the event, remove the retry parts + applyWriteQueue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } + return true; } void BaseApplyEngine::processNextApplyEvent(){ - auto queue = applyWriteQueue; - PacketPtr pkt = queue.front(); - uint8_t* data = pkt->getPtr(); + PacketPtr pkt = applyWriteQueue.front(); + uint8_t* data = pkt->getPtr(); - RequestPtr 
request = pkt->req; - int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t prop = wl.prop; - uint32_t temp_prop = wl.temp_prop; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; + WorkListItem wl = memoryToWorkList(data + request_offset); + uint32_t prop = wl.prop; + uint32_t temp_prop = wl.temp_prop; - if (temp_prop != prop){ - // if (!memPort.blocked() && !reqPort.blocked()){ - //update prop with temp_prop - if(prop < temp_prop){ - wl.prop = prop; - }else{ - wl.prop = temp_prop; - } - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - recvApplyNotif(wl.prop, - wl.degree, - wl.edgeIndex)){ - queue.pop(); - // memPort.trySendRetry(); - // queue.sendPktRetry = false; - } - }else{ - queue.applyQueue.pop(); - if (queue.sendPktRetry && !queue.blocked()){ - // memPort.trySendRetry(); - queue.sendPktRetry = false; - } + if (temp_prop != prop) { + // TODO: instead of min add a Reduce function. + //update prop with temp_prop + if(prop < temp_prop) { + wl.prop = prop; + }else { + wl.prop = temp_prop; + } + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. 
+ PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (sendMemReq(writePkt) && + sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { + applyWriteQueue.pop(); } - if(!queue.empty() && !nextApplyEvent.scheduled()){ + }else { + applyWriteQueue.pop(); + } + if(!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 56b43cfb7b..b7c0db90cb 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -45,53 +45,24 @@ namespace gem5 class BaseApplyEngine : public ClockedObject { private: - //FIXME: Remove queue defenition from here. - struct ApplyQueue{ - std::queue applyQueue; - const uint32_t queueSize; - bool sendPktRetry; - - bool blocked(){ - return (applyQueue.size() == queueSize); - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - void pop(){ - applyQueue.pop(); - } - - PacketPtr front(){ - return applyQueue.front(); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize) - {} - }; RequestorID requestorId; - ApplyQueue applyReadQueue; - ApplyQueue applyWriteQueue; + std::queue applyReadQueue; + std::queue applyWriteQueue; + int queueSize; std::unordered_map requestOffset; - bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - void handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); protected: virtual bool sendMemReq(PacketPtr pkt) = 0; - virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; + virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: BaseApplyEngine(const BaseApplyEngineParams &apply); @@ -101,6 +72,9 @@ class BaseApplyEngine : public ClockedObject RequestorID getRequestorId(); void 
setRequestorId(RequestorID requestorId); + + bool recvWLNotif(Addr addr); + bool handleMemResp(PacketPtr resp); }; } diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 6871154276..d93cbdf8da 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -33,10 +33,9 @@ namespace gem5 { -BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : +BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), // updateQueue(params.update_queue_size), @@ -65,44 +64,6 @@ BasePushEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BasePushEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 - }; - Edge edges [6] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, requestorId); - memPort.sendFunctional(pkt); - } - - for (int i = 0; i < 6; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, requestorId); - memPort.sendFunctional(pkt); - } - -} - - - bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) @@ -135,7 +96,7 @@ BasePushEngine::processNextReceiveEvent() for (uint32_t index = 0; index < notif.degree; index++) { // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 
+ (notif.edge_index + index) * sizeof(Edge); + Addr edge_addr = 1048576 + (notif.edgeIndex + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 873cb26b3d..c723932975 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -54,8 +54,6 @@ class BasePushEngine : public ClockedObject {} }; - virtual void startup() override; - RequestorID requestorId; std::queue notifQueue; @@ -72,24 +70,22 @@ class BasePushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - bool handleMemResp(PacketPtr pkt); EventFunctionWrapper nextSendEvent; void processNextSendEvent(); protected: - virtual bool sendMemRequest(PacketPtr pkt) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendPushUpdate(PacketPtr pkt) = 0; public: - BasePushEngine(const PushEngineParams ¶ms); + BasePushEngine(const BasePushEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -97,6 +93,8 @@ class BasePushEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorID requestorId); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); + bool handleMemResp(PacketPtr pkt); }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index b863b38e19..806ab4a6c3 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -30,8 +30,6 @@ #include -#include "accl/graph/base/util.hh" - namespace gem5 { @@ -87,7 +85,7 @@ void BaseWLEngine::processNextWLReadEvent(){ std::make_shared(req_addr, 64, 
0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (sendMemReq()){ + if (sendMemReq(memPkt)){ queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ @@ -136,7 +134,7 @@ BaseWLEngine::processNextWLReduceEvent(){ PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); if (sendMemReq(writePkt) && - sendWLNotif(writePkt)) { + sendWLNotif(writePkt->getAddr())) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3d807d8b06..a2cab4c7e2 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -91,7 +91,7 @@ class BaseWLEngine : public ClockedObject If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ - void handleMemResp(PacketPtr resp); + EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -101,7 +101,7 @@ class BaseWLEngine : public ClockedObject */ protected: virtual bool sendMemReq(PacketPtr pkt) = 0; - virtual bool sendWLNotif(WorkListItem wl) = 0; + virtual bool sendWLNotif(Addr addr) = 0; public: BaseWLEngine(const BaseWLEngineParams ¶ms); @@ -111,7 +111,9 @@ class BaseWLEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorID requestorId); + bool handleWLUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr resp); }; } diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 046dfaf4e8..68cfb3d42d 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,9 +38,9 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - workListEngine = Param.WLEngine("WLEngine object to connect to " + work_list_engine = Param.WLEngine("WLEngine object to connect to " "This MPU") - 
applyEngine = Param.ApplyEngine("ApplyEngine object to connect to " + apply_engine = Param.ApplyEngine("ApplyEngine object to connect to " "This MPU") - pushEngine = Param.PushEngine("PushEngine object to connect to " + push_engine = Param.PushEngine("PushEngine object to connect to " "This MPU") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 64ae71e290..bc45850041 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,11 +27,13 @@ */ #include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/mpu.hh" namespace gem5{ -ApplyEngine::ApplyEngine(const BaseApplyEngine ¶ms): - BaseApplyEngine(params) +ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : + BaseApplyEngine(params), + mpu(params.mpu) {} bool @@ -40,9 +42,9 @@ ApplyEngine::sendMemReq(PacketPtr pkt){ } bool -ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ +ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ mpu->recvApplyNotif(prop, degree, edgeIndex); - + return true; } } \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 855ebbd8b0..17e3280cb5 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -42,14 +42,21 @@ namespace gem5 { +class MPU; + class ApplyEngine : public BaseApplyEngine { private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); - virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + public: + ApplyEngine(const ApplyEngineParams ¶ms); }; diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 27f7c8e314..4824bcd699 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -31,6 +31,31 @@ namespace gem5 { +MPU::MPU(const MPUParams ¶ms): + 
ClockedObject(params), + nextRequestorId(0), + respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), + memPort(name() + ".memPort", this), + applyEngine(params.apply_engine), + pushEngine(params.push_engine), + wlEngine(params.work_list_engine) +{} + +Port& +MPU::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "respPort") { + return respPort; + } else if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + void MPU::startup() { @@ -43,6 +68,37 @@ MPU::startup() if (((int16_t) wlEngine->getRequestorId()) == -1) { wlEngine->setRequestorId(nextRequestorId++); } + + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + memPort.sendFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + memPort.sendFunctional(pkt); + } } AddrRangeList @@ -54,7 +110,7 @@ MPU::MPURespPort::getAddrRanges() const bool MPU::MPURespPort::recvTimingReq(PacketPtr pkt) { - return wlEngine->handleWLUpdate(pkt); + return owner->handleWLUpdate(pkt); } Tick @@ -106,12 +162,6 @@ MPU::MPUReqPort::recvReqRetry() } } -bool -MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - void MPU::MPUMemPort::sendPacket(PacketPtr pkt) { @@ -124,6 +174,14 @@ 
MPU::MPUMemPort::sendPacket(PacketPtr pkt) } } +bool +MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + owner->handleMemResp(pkt); + return true; +} + void MPU::MPUMemPort::recvReqRetry() { @@ -146,19 +204,21 @@ MPU::getAddrRanges() void MPU::recvFunctional(PacketPtr pkt) { - if (pkt->isUpdateWL()) { - panic("Functional requests should not be made to WL.") + if (pkt->cmd == MemCmd::UpdateWL) { + panic("Functional requests should not be made to WL."); //TODO: Might be a good idea to implement later. // wlEngine->recvFunctional(pkt); } else { - memPort.recvFuctional(pkt); + memPort.sendFunctional(pkt); } } bool MPU::handleMemReq(PacketPtr pkt) { - return memPort.recvTimingReq(pkt); + //TODO: Investigate sending true all the time + memPort.sendPacket(pkt); + return true; } void @@ -177,33 +237,42 @@ MPU::handleMemResp(PacketPtr pkt) } bool -MPU::recvWLNotif(WorkListItem wl) +MPU::handleWLUpdate(PacketPtr pkt) +{ + return wlEngine->handleWLUpdate(pkt); +} + +bool +MPU::recvWLNotif(Addr addr) { - return applyEngine->recvWLUpdate(wl); + return applyEngine->recvWLNotif(addr); } bool -MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { - return pushEngine->recvApplyUpdate(prop, degree, edge_index); + return pushEngine->recvApplyNotif(prop, degree, edge_index); } bool MPU::recvPushUpdate(PacketPtr pkt) { Addr addr = pkt->getAddr(); - for (auto addr_range: memPort.getAddrRangeList()) { + for (auto addr_range: memPort.getAddrRanges()) { if (addr_range.contains(addr)) { - if (!memPort.sendPacket(pkt)) { + if (memPort.blocked()) { return false; + } else { + memPort.sendPacket(pkt); + return true; } - return true; } } - if (!reqPort.sendPacket(pkt)) { + if (reqPort.blocked()) { return false; } + reqPort.sendPacket(pkt); return true; } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index b37821c200..be5139c0e0 
100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,7 +29,6 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ -#include "accl/graph/base/util.hh" #include "accl/graph/sega/apply_engine.hh" #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/wl_engine.hh" @@ -91,7 +90,7 @@ class MPU : public ClockedObject PacketPtr blockedPacket; public: - MemPort(const std::string& name, MPU* owner): + MPUMemPort(const std::string& name, MPU* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -119,21 +118,22 @@ class MPU : public ClockedObject AddrRangeList getAddrRanges(); void recvFunctional(PacketPtr pkt); - bool handleMemReq(PacketPtr pkt); - void handleMemResp(PacketPtr pkt); - - bool recvWLNotif(WorkListItem wl); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - bool recvPushUpdate(PacketPtr pkt); - public: MPU(const MPUParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool handleMemReq(PacketPtr pkt); + void handleMemResp(PacketPtr pkt); + + bool handleWLUpdate(PacketPtr pkt); + bool recvWLNotif(Addr addr); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); + bool recvPushUpdate(PacketPtr pkt); }; } -#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e43512c6f4..922ae32ed2 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -27,13 +27,14 @@ */ #include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/mpu.hh" namespace gem5 { -PushEngine::PushEngine(const PushEngine ¶ms) : +PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), - owner(params.mpu) + mpu(params.mpu) { } @@ -44,15 +45,15 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } bool 
-PushEngine::sendMemReq(PacketPtr) +PushEngine::sendMemReq(PacketPtr pkt) { - return owner->handleMemReq(pkt); + return mpu->handleMemReq(pkt); } bool PushEngine::sendPushUpdate(PacketPtr pkt) { - return owner->recvPushUpdate(pkt); + return mpu->recvPushUpdate(pkt); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 54ef72d5f9..e4bb83d2bc 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,16 +30,20 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" +#include "params/PushEngine.hh" namespace gem5 { + +class MPU; + class PushEngine : public BasePushEngine { private: - MPU* owner; + MPU* mpu; protected: - virtual bool sendMemRequest(PacketPtr pkt); + virtual bool sendMemReq(PacketPtr pkt); virtual bool sendPushUpdate(PacketPtr pkt); public: @@ -48,8 +52,8 @@ class PushEngine : public BasePushEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; -} +}; } -#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9608d0cbc4..40ec755969 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,24 +27,25 @@ */ #include "accl/graph/sega/wl_engine.hh" - -#include - +#include "accl/graph/sega/mpu.hh" namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params) + BaseWLEngine(params), + mpu(params.mpu) {} -virtual bool +bool WLEngine::sendMemReq(PacketPtr pkt){ return mpu->handleMemReq(pkt); } // FIXME: handle the case where Apply queue is full -virtual bool -WLEngine::sendWLNotif(WorkListItem wl){ - mpu->recvWLNotif(wl); +bool +WLEngine::sendWLNotif(Addr addr){ + mpu->recvWLNotif(addr); return true; -} \ No newline at end of file +} + +} diff --git a/src/accl/graph/sega/wl_engine.hh 
b/src/accl/graph/sega/wl_engine.hh index 938128e05f..c5f49ff6a2 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,16 +45,23 @@ namespace gem5 { -class WLEngine : public BaseWorkListEngine +// class MPU; + +class WLEngine : public BaseWLEngine { private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); - virtual bool sendWLNotif(WorkListItem wl); + virtual bool sendWLNotif(Addr addr); + public: + WLEngine(const WLEngineParams ¶ms); }; } -#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 44c44d08a6..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -178,7 +178,6 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent - UpdateWL, // MPU Accelerator NUM_COMMAND_ATTRIBUTES }; @@ -268,8 +267,6 @@ class MemCmd cmd == ReadCleanReq || cmd == ReadSharedReq); } - bool isUpdateWL() const {return testCmdAttrib(updateWL);} - Command responseCommand() const { From 8967f89ddfe20c155706993789344c5eff701d3c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 14:59:31 -0800 Subject: [PATCH 045/247] Style fix. 
--- src/accl/graph/base/BaseApplyEngine.py | 2 +- src/accl/graph/base/BasePushEngine.py | 1 - src/accl/graph/base/BaseWLEngine.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 45d94b3fd2..e48b425b01 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -31,7 +31,7 @@ class BaseApplyEngine(ClockedObject): type = 'BaseApplyEngine' - cxx_header = "accl/graph/base/base_apply_engine.hh" + cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 891221c06d..793b0a7c92 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -33,4 +33,3 @@ class BasePushEngine(ClockedObject): type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 3ecf030138..473fd05313 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -35,4 +35,3 @@ class BaseWLEngine(ClockedObject): cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") - From fa48d321dd41debc82f39646adf23ad780ca05a7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 15:20:51 -0800 Subject: [PATCH 046/247] Adding PARAMS macro. 
--- src/accl/graph/base/SConscript | 6 +++--- src/accl/graph/base/base_apply_engine.hh | 2 ++ src/accl/graph/base/base_push_engine.hh | 2 ++ src/accl/graph/base/base_wl_engine.hh | 3 +++ src/accl/graph/sega/SConscript | 8 ++++---- src/accl/graph/sega/apply_engine.hh | 2 +- src/accl/graph/sega/mpu.hh | 2 +- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.hh | 2 +- 9 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index cc55100064..5e82a44971 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,9 @@ Import('*') -SimObject('BaseApplyEngine.py') -SimObject('BasePushEngine.py') -SimObject('BaseWLEngine.py') +SimObject('BaseApplyEngine.py', sim_objects=["BaseApplyEngine"]) +SimObject('BasePushEngine.py', sim_objects=["BasePushEngine"]) +SimObject('BaseWLEngine.py', sim_objects=["BaseWLEngine"]) Source('base_apply_engine.cc') Source('base_push_engine.cc') diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index b7c0db90cb..fbcf95c238 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -65,6 +65,8 @@ class BaseApplyEngine : public ClockedObject virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: + PARAMS(BaseApplyEngine); + BaseApplyEngine(const BaseApplyEngineParams &apply); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index c723932975..446f6a1186 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -85,6 +85,8 @@ class BasePushEngine : public ClockedObject public: + PARAMS(BasePushEngine); + BasePushEngine(const BasePushEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/base/base_wl_engine.hh 
b/src/accl/graph/base/base_wl_engine.hh index a2cab4c7e2..4cb492914c 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -104,6 +104,9 @@ class BaseWLEngine : public ClockedObject virtual bool sendWLNotif(Addr addr) = 0; public: + + PARAMS(BaseWLEngine); + BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index dc19ece06b..793dacc2ef 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,10 @@ Import('*') -SimObject('ApplyEngine.py') -SimObject('MPU.py') -SimObject('PushEngine.py') -SimObject('WLEngine.py') +SimObject('ApplyEngine.py', sim_objects=["ApplyEngine"]) +SimObject('MPU.py', sim_objects=["MPU"]) +SimObject('PushEngine.py', sim_objects=["PushEngine"]) +SimObject('WLEngine.py', sim_objects=["WLEngine"]) Source('apply_engine.cc') Source('mpu.cc') diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 17e3280cb5..c7d3073e36 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -56,7 +56,7 @@ class ApplyEngine : public BaseApplyEngine virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: - + PARAMS(ApplyEngine); ApplyEngine(const ApplyEngineParams ¶ms); }; diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index be5139c0e0..cf241c9063 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -119,7 +119,7 @@ class MPU : public ClockedObject void recvFunctional(PacketPtr pkt); public: - + PARAMS(MPU); MPU(const MPUParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e4bb83d2bc..1a800e58f3 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -47,6 +47,7 @@ class PushEngine : public 
BasePushEngine virtual bool sendPushUpdate(PacketPtr pkt); public: + PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c5f49ff6a2..238ffbe724 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -59,7 +59,7 @@ class WLEngine : public BaseWLEngine virtual bool sendWLNotif(Addr addr); public: - + PARAMS(WLEngine); WLEngine(const WLEngineParams ¶ms); }; From 9a5245c317917f60daf0eb400260ec5b11304f26 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 15:33:13 -0800 Subject: [PATCH 047/247] First compilation after restructure. --- src/accl/graph/base/BaseApplyEngine.py | 1 + src/accl/graph/base/BasePushEngine.py | 1 + src/accl/graph/base/BaseWLEngine.py | 1 + src/accl/graph/base/SConscript | 6 +++--- src/accl/graph/sega/SConscript | 8 ++++---- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index e48b425b01..fdabefc732 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BaseApplyEngine(ClockedObject): + abstract = True type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 793b0a7c92..d30124a6a4 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BasePushEngine(ClockedObject): + abstract = True type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 
473fd05313..7dcacefd97 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BaseWLEngine(ClockedObject): + abstract = True type = 'BaseWLEngine' cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 5e82a44971..cc55100064 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,9 @@ Import('*') -SimObject('BaseApplyEngine.py', sim_objects=["BaseApplyEngine"]) -SimObject('BasePushEngine.py', sim_objects=["BasePushEngine"]) -SimObject('BaseWLEngine.py', sim_objects=["BaseWLEngine"]) +SimObject('BaseApplyEngine.py') +SimObject('BasePushEngine.py') +SimObject('BaseWLEngine.py') Source('base_apply_engine.cc') Source('base_push_engine.cc') diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 793dacc2ef..dc19ece06b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,10 @@ Import('*') -SimObject('ApplyEngine.py', sim_objects=["ApplyEngine"]) -SimObject('MPU.py', sim_objects=["MPU"]) -SimObject('PushEngine.py', sim_objects=["PushEngine"]) -SimObject('WLEngine.py', sim_objects=["WLEngine"]) +SimObject('ApplyEngine.py') +SimObject('MPU.py') +SimObject('PushEngine.py') +SimObject('WLEngine.py') Source('apply_engine.cc') Source('mpu.cc') From c3b4c743d4953d3648fca7dd384e0f8ed33006f2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 07:38:41 -0800 Subject: [PATCH 048/247] Adding config file for SEGA and missing ports. 
--- configs/accl/sega.py | 34 ++++++++++++++++++++++++++++++++++ src/accl/graph/sega/MPU.py | 10 +++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 configs/accl/sega.py diff --git a/configs/accl/sega.py b/configs/accl/sega.py new file mode 100644 index 0000000000..288b1211e4 --- /dev/null +++ b/configs/accl/sega.py @@ -0,0 +1,34 @@ +import m5 +from m5.objects import * + +class PyMPU(MPU): + def __init__(self, clk_domain): + super().__init__() + self.clk_domain = clk_domain + self.apply_engine = ApplyEngine() + self.push_engine = PushEngine() + self.wl_engine = WLEngine() + +class SEGA(System): + + def __init__(self): + super(SEGA, self).__init__() + # Set up the clock domain and the voltage domain + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + + self.mpu = PyMPU(self.clk_domain) + self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) + self.mpu.memPort = self.mem_ctrl.port + self.mpu.reqPort = self.mpu.respPort + + +system = SEGA() +root = Root(full_system = False, system = system) + +m5.instantiate() + +exit_event = m5.simulate() +print("Simulation finished!") +exit() \ No newline at end of file diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 68cfb3d42d..efd8dbc11f 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,9 +38,13 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - work_list_engine = Param.WLEngine("WLEngine object to connect to " + apply_engine = Param.ApplyEngine(NULL, "ApplyEngine object to connect to " "This MPU") - apply_engine = Param.ApplyEngine("ApplyEngine object to connect to " + push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " "This MPU") - push_engine = Param.PushEngine("PushEngine object to connect to " + work_list_engine = Param.WLEngine(NULL, "WLEngine object to connect to " "This MPU") + + respPort = 
ResponsePort("Port to Receive updates from outside") + reqPort = RequestPort("Port to send updates to the outside") + memPort = RequestPort("Port to communicate with the memory") From 7be5866c0171399e8d5ef6851290dd61e7ef6fc9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 12:22:14 -0800 Subject: [PATCH 049/247] Adding BaseEngine class and started pointer fix. --- src/accl/graph/base/BaseApplyEngine.py | 4 +- src/accl/graph/base/BaseEngine.py | 38 ++++++++++ src/accl/graph/base/BasePushEngine.py | 2 + src/accl/graph/base/BaseWLEngine.py | 1 + src/accl/graph/base/base_apply_engine.cc | 22 +----- src/accl/graph/base/base_apply_engine.hh | 9 +-- src/accl/graph/base/base_engine.cc | 75 ++++++++++++++++++++ src/accl/graph/base/base_engine.hh | 90 ++++++++++++++++++++++++ src/accl/graph/sega/ApplyEngine.py | 2 +- 9 files changed, 213 insertions(+), 30 deletions(-) create mode 100644 src/accl/graph/base/BaseEngine.py create mode 100644 src/accl/graph/base/base_engine.cc create mode 100644 src/accl/graph/base/base_engine.hh diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index fdabefc732..be849ed1af 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BaseApplyEngine(ClockedObject): +class BaseApplyEngine(BaseEngine): abstract = True type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py new file mode 100644 index 0000000000..3eb5f0cbbc --- /dev/null +++ b/src/accl/graph/base/BaseEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseEngine(ClockedObject): + abstract = True + type = 'BaseEngine' + cxx_header = "accl/graph/base/base_engine.hh" + cxx_class = 'gem5::BaseEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index d30124a6a4..c52a65abf9 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -34,3 +34,5 @@ class BasePushEngine(ClockedObject): type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 7dcacefd97..ec34b52005 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -36,3 +36,4 @@ class BaseWLEngine(ClockedObject): cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 731cd5c345..4fd53fb037 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -36,31 +36,12 @@ namespace gem5 { BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): - ClockedObject(params), - requestorId(-1), + BaseEngine(params), queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} -Port & -BaseApplyEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BaseApplyEngine::getRequestorId() -{ - return requestorId; -} - -void -BaseApplyEngine::setRequestorId(RequestorID 
requestorId) -{ - this->requestorId = requestorId; -} - bool BaseApplyEngine::recvWLNotif(Addr addr){ // TODO: Investigate the situation where the queue is full. // if (applyReadQueue.size() == queueSize){ @@ -82,6 +63,7 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; + // FIXME: sendMemReq returns void, use memPortBlocked to check instead. if (sendMemReq(memPkt)){ applyReadQueue.pop(); } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index fbcf95c238..f81f23428e 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -32,6 +32,7 @@ #include #include +#include "accl/graph/base/base_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "mem/request.hh" @@ -42,12 +43,10 @@ namespace gem5 { -class BaseApplyEngine : public ClockedObject +class BaseApplyEngine : public BaseEngine { private: - RequestorID requestorId; - std::queue applyReadQueue; std::queue applyWriteQueue; int queueSize; @@ -61,7 +60,6 @@ class BaseApplyEngine : public ClockedObject void processNextApplyEvent(); protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: @@ -72,9 +70,6 @@ class BaseApplyEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool recvWLNotif(Addr addr); bool handleMemResp(PacketPtr resp); }; diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc new file mode 100644 index 0000000000..d53e2e683a --- /dev/null +++ b/src/accl/graph/base/base_engine.cc @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/base_engine.hh" + +namespace gem5 +{ + +BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : + ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId()), + memPort(name() + ".memPort", this) +{} + + +void +BaseEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +BaseEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + return owner->handleMemResp(pkt); + +} + +void +BaseEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh new file mode 100644 index 0000000000..f9f500e118 --- /dev/null +++ b/src/accl/graph/base/base_engine.hh @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ + +#include +#include + +#include "mem/packet.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/BaseEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + System* system; + const RequestorID requestorId; + MemPort memPort; + + protected: + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + virtual bool handleMemResp(PacketPtr resp) = 0; + + public: + PARAMS(BaseEngine); + + BaseEngine(const BaseEngineParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index bb43836ff7..5bb0dc0c25 100644 --- 
a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -34,4 +34,4 @@ class ApplyEngine(BaseApplyEngine): cxx_header = "accl/graph/sega/apply_engine.hh" cxx_class = 'gem5::ApplyEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this ApplyEngine") + push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") From 1bf60b6fa044f8913814d4234e4a209f6076fa1d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 21:44:29 -0800 Subject: [PATCH 050/247] Cont. fixing pointer issue. --- src/accl/graph/base/BaseApplyEngine.py | 2 - src/accl/graph/base/BaseWLEngine.py | 7 +-- src/accl/graph/base/base_apply_engine.cc | 53 ++++++++-------- src/accl/graph/base/base_apply_engine.hh | 4 +- src/accl/graph/base/base_wl_engine.cc | 79 +++++++----------------- src/accl/graph/base/base_wl_engine.hh | 52 +++------------- 6 files changed, 63 insertions(+), 134 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index be849ed1af..9b240581ac 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -34,5 +34,3 @@ class BaseApplyEngine(BaseEngine): type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' - - applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index ec34b52005..7311c396b3 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -27,13 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BaseWLEngine(ClockedObject): +class BaseWLEngine(BaseEngine): abstract = True type = 'BaseWLEngine' cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' - - wlQueueSize = Param.Unsigned(32, "Size of write 
queue") - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 4fd53fb037..7f6c32cf39 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,34 +37,35 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} -bool BaseApplyEngine::recvWLNotif(Addr addr){ +bool +BaseApplyEngine::recvWLNotif(Addr addr) +{ // TODO: Investigate the situation where the queue is full. - // if (applyReadQueue.size() == queueSize){ - // // applyReadQueue.sendPktRetry = true; - // return true; - // } else{ applyReadQueue.push(addr); - // } if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; } -void BaseApplyEngine::processNextApplyCheckEvent(){ +void +BaseApplyEngine::processNextApplyCheckEvent() +{ + // TODO: We might want to change the way this function + // pops items off queue, maybe we should pop every n cycles + // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; int req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - // FIXME: sendMemReq returns void, use memPortBlocked to check instead. 
- if (sendMemReq(memPkt)){ + if (!memPortBlocked()) { + sendMemReq(memPkt); applyReadQueue.pop(); } if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -75,7 +76,6 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ bool BaseApplyEngine::handleMemResp(PacketPtr pkt) { - // FIXME: change the event, remove the retry parts applyWriteQueue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); @@ -84,38 +84,39 @@ BaseApplyEngine::handleMemResp(PacketPtr pkt) } void -BaseApplyEngine::processNextApplyEvent(){ +BaseApplyEngine::processNextApplyEvent() +{ PacketPtr pkt = applyWriteQueue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t prop = wl.prop; - uint32_t temp_prop = wl.temp_prop; - if (temp_prop != prop) { + WorkListItem wl = memoryToWorkList(data + request_offset); + // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem + // to applyengine if temp_prop < prop. If temp_prop has not changed, why + // fwd it to applyengine? + if (wl.temp_prop < wl.prop) { // TODO: instead of min add a Reduce function. //update prop with temp_prop - if(prop < temp_prop) { - wl.prop = prop; - }else { - wl.prop = temp_prop; - } + wl.prop = wl.temp_prop; //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); //Create memory write requests. 
PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { - applyWriteQueue.pop(); + + if (!memPortBlocked()) { + if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { + sendMemReq(writePkt); + applyWriteQueue.pop(); + } } - }else { + } else { applyWriteQueue.pop(); } - if(!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ + if (!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index f81f23428e..dc7188ab56 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -37,7 +37,6 @@ #include "mem/port.hh" #include "mem/request.hh" #include "params/BaseApplyEngine.hh" -#include "sim/clocked_object.hh" #include "sim/port.hh" namespace gem5 @@ -60,6 +59,7 @@ class BaseApplyEngine : public BaseEngine void processNextApplyEvent(); protected: + virtual bool handleMemResp(PacketPtr pkt); virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: @@ -71,7 +71,7 @@ class BaseApplyEngine : public BaseEngine PortID idx=InvalidPortID) override; bool recvWLNotif(Addr addr); - bool handleMemResp(PacketPtr resp); + }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 806ab4a6c3..aab39fb7a3 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -34,61 +34,37 @@ namespace gem5 { BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): - ClockedObject(params), - requestorId(-1), - updateQueue(params.wlQueueSize), - responseQueue(params.wlQueueSize), + BaseEngine(params), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) {} -Port & -BaseWLEngine::getPort(const std::string &if_name, PortID 
idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BaseWLEngine::getRequestorId() +bool +BaseWLEngine::handleWLUpdate(PacketPtr pkt) { - return requestorId; + updateQueue.push(pkt); + if(!nextWLReadEvent.scheduled()) { + schedule(nextWLReadEvent, nextCycle()); + } + return true; } -void -BaseWLEngine::setRequestorId(RequestorID requestorId) +void BaseWLEngine::processNextWLReadEvent() { - this->requestorId = requestorId; -} + PacketPtr pkt = updateQueue.front(); -bool -BaseWLEngine::handleWLUpdate(PacketPtr pkt){ - auto queue = updateQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else - queue.push(pkt); + Addr addr = pkt->getAddr(); + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; - if(!nextWLReadEvent.scheduled()){ - schedule(nextWLReadEvent, nextCycle()); - } - return true; -} + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffsetMap[request] = req_offset; -void BaseWLEngine::processNextWLReadEvent(){ - auto queue = updateQueue; - PacketPtr pkt = queue.front(); - /// conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = - std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - if (sendMemReq(memPkt)){ - queue.pop(); + if (memPortBlocked()) { + sendMemReq(memPkt) + updateQueue.pop(); } - if(!queue.empty() && !nextWLReadEvent.scheduled()){ + if (!queue.empty() && !nextWLReadEvent.scheduled()) { schedule(nextWLReadEvent, nextCycle()); } } @@ -96,24 +72,15 @@ void BaseWLEngine::processNextWLReadEvent(){ bool BaseWLEngine::handleMemResp(PacketPtr pkt) { - auto queue = responseQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else{ - queue.push(pkt); - } - if(!nextWLReduceEvent.scheduled()){ - schedule(nextWLReduceEvent, nextCycle()); - } - return true; + responseQueue.push(pkt); + 
if(!nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } return true; } void BaseWLEngine::processNextWLReduceEvent(){ - auto queue = responseQueue; - auto updateQ = updateQueue; PacketPtr update = updateQ.front(); uint8_t* value = update->getPtr(); PacketPtr pkt = queue.front(); diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 4cb492914c..063e9909be 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -32,57 +32,26 @@ #include #include +#include "accl/graph/base/base_engine.hh" #include "accl/graph/base/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/BaseWLEngine.hh" -#include "sim/clocked_object.hh" #include "sim/port.hh" #include "sim/system.hh" namespace gem5 { -class BaseWLEngine : public ClockedObject +class BaseWLEngine : public BaseEngine { private: - //FIXME: Change this - struct WLQueue{ - std::queue wlQueue; - uint32_t queueSize; - bool sendPktRetry; - - void resize(uint32_t size){ - queueSize = size; - } - - bool blocked(){ - return (wlQueue.size() == queueSize); - } - bool empty(){ - return wlQueue.empty(); - } - void push(PacketPtr pkt){ - wlQueue.push(pkt); - } - void pop(){ - wlQueue.pop(); - } - PacketPtr front(){ - return wlQueue.front(); - } - - WLQueue(uint32_t qSize): - queueSize(qSize), - sendPktRetry(false){} - }; - - RequestorID requestorId; - WLQueue updateQueue; - WLQueue responseQueue; - - std::unordered_map requestOffset; + std::queue updateQueue; + std::queue responseQueue; + + std::unordered_map requestOffsetMap; + std::unordered_map requestValueMap; //Events EventFunctionWrapper nextWLReadEvent; @@ -100,7 +69,7 @@ class BaseWLEngine : public ClockedObject Write edgelist loc in buffer */ protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool handleMemResp(PacketPtr resp); virtual bool sendWLNotif(Addr addr) = 0; public: @@ -112,11 +81,8 @@ class 
BaseWLEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool handleWLUpdate(PacketPtr pkt); - bool handleMemResp(PacketPtr resp); + }; } From a8a3d0dc91778cbb21553938f7b3840e2d2af979 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 10:16:01 -0800 Subject: [PATCH 051/247] Cont. fix pointer issue. --- src/accl/graph/base/BasePushEngine.py | 6 +-- src/accl/graph/base/base_apply_engine.hh | 1 - src/accl/graph/base/base_push_engine.cc | 19 ------- src/accl/graph/base/base_push_engine.hh | 19 ++----- src/accl/graph/base/base_wl_engine.cc | 64 +++++++++--------------- 5 files changed, 31 insertions(+), 78 deletions(-) diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index c52a65abf9..2163864be3 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -27,12 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BasePushEngine(ClockedObject): +class BasePushEngine(BaseEngine): abstract = True type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index dc7188ab56..2cb9d8b918 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -48,7 +48,6 @@ class BaseApplyEngine : public BaseEngine std::queue applyReadQueue; std::queue applyWriteQueue; - int queueSize; std::unordered_map requestOffset; diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index d93cbdf8da..f2384c434b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ 
b/src/accl/graph/base/base_push_engine.cc @@ -35,7 +35,6 @@ namespace gem5 BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : ClockedObject(params), - requestorId(-1), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), // updateQueue(params.update_queue_size), @@ -46,24 +45,6 @@ BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : { } -Port & -BasePushEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BasePushEngine::getRequestorId() -{ - return requestorId; -} - -void -BasePushEngine::setRequestorId(RequestorID requestorId) -{ - this->requestorId = requestorId; -} - bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 446f6a1186..f568b6ecc3 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -31,16 +31,16 @@ #include +#include "accl/graph/base/base_engine.hh" #include "mem/port.hh" #include "mem/request.hh" #include "mem/packet.hh" #include "params/BasePushEngine.hh" -#include "sim/clocked_object.hh" namespace gem5 { -class BasePushEngine : public ClockedObject +class BasePushEngine : public BaseEngine { private: @@ -53,9 +53,6 @@ class BasePushEngine : public ClockedObject prop(prop), degree(degree), edgeIndex(edge_index) {} }; - - RequestorID requestorId; - std::queue notifQueue; // int vertexQueueSize; // int vertexQueueLen; @@ -64,8 +61,6 @@ class BasePushEngine : public ClockedObject std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - std::queue memReqQueue; // Infinite queueing? 
- std::queue updateQueue; // int updateQueueSize; // int updateQueueLen; @@ -80,8 +75,8 @@ class BasePushEngine : public ClockedObject void processNextSendEvent(); protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendPushUpdate(PacketPtr pkt) = 0; + virtual bool handleMemResp(PacketPtr pkt); public: @@ -89,14 +84,8 @@ class BasePushEngine : public ClockedObject BasePushEngine(const BasePushEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - bool handleMemResp(PacketPtr pkt); + }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index aab39fb7a3..d5b18bafa0 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -52,13 +52,15 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt) void BaseWLEngine::processNextWLReadEvent() { PacketPtr pkt = updateQueue.front(); + uint32_t data = *(pkt->getPtr()); Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; Addr req_offset = addr % 64; PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[request] = req_offset; + requestOffsetMap[memPkt->req] = req_offset; + requestValueMap[memPkt->req] = value; if (memPortBlocked()) { sendMemReq(memPkt) @@ -80,51 +82,35 @@ BaseWLEngine::handleMemResp(PacketPtr pkt) } void -BaseWLEngine::processNextWLReduceEvent(){ - PacketPtr update = updateQ.front(); - uint8_t* value = update->getPtr(); - PacketPtr pkt = queue.front(); - uint8_t* data = pkt->getPtr(); - RequestPtr request = pkt->req; - int request_offset = requestOffset[request]; +BaseWLEngine::processNextWLReduceEvent() +{ + PacketPtr resp = responseQueue.front(); + uint8_t* respData = resp->getPtr(); + Addr request_offset = requestOffsetMap[resp->req]; + uint32_t value = requestValueMap[resp->req]; 
WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t temp_prop = wl.temp_prop; - if (temp_prop != *value){ + + if (value < wl.temp_prop){ //update prop with temp_prop - if(*value < temp_prop){ - temp_prop = *value; - } - // if (!memPort.blocked() && !applyPort.blocked()){ - wl.temp_prop = temp_prop; - uint8_t* wlItem = workListToMemory(wl); - memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + wl.temp_prop = value; + + uint8_t* wlData = workListToMemory(wl); + memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - sendWLNotif(writePkt->getAddr())) { - queue.pop(); - if (!queue.blocked() && queue.sendPktRetry){ - queue.sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); - updateQ.sendPktRetry = false; + getWritePacket(pkt->getAddr(), 64, respData, requestorId); + + if (!memPortBlocked()) { + if (sendWLNotif(pkt->getAddr() + request_offset)) { + sendMemReq(writePkt); + responseQueue.pop(); + // TODO: Erase map entries, delete wlData; } } } - else{ - queue.pop(); - if (!queue.blocked() && queue.sendPktRetry){ - queue.sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ.blocked() & updateQ.sendPktRetry){ - updateQ.sendPktRetry = false; - } - + else { + responseQueue.pop(); } - if (!queue.empty() && !nextWLReduceEvent.scheduled()){ + if (!responseQueue.empty() && !nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } } From 5a595540a569128ec01d730c25f4091a0a7c3a6f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:07:21 -0800 Subject: [PATCH 052/247] Cont. fix pointer issue. MemQ to BaseEngine. 
--- src/accl/graph/base/base_apply_engine.cc | 22 ++----- src/accl/graph/base/base_apply_engine.hh | 11 +--- src/accl/graph/base/base_engine.cc | 13 +++- src/accl/graph/base/base_engine.hh | 17 +++++- src/accl/graph/base/base_push_engine.cc | 77 ++++++------------------ src/accl/graph/base/base_push_engine.hh | 16 +---- src/accl/graph/base/base_wl_engine.cc | 22 ++----- src/accl/graph/base/base_wl_engine.hh | 3 +- src/accl/graph/sega/mpu.hh | 2 - 9 files changed, 65 insertions(+), 118 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 7f6c32cf39..842481c2d1 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -73,20 +73,10 @@ BaseApplyEngine::processNextApplyCheckEvent() } } -bool -BaseApplyEngine::handleMemResp(PacketPtr pkt) -{ - applyWriteQueue.push(pkt); - if(!nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); - } - return true; -} - void -BaseApplyEngine::processNextApplyEvent() +BaseApplyEngine::processNextMemRespEvent() { - PacketPtr pkt = applyWriteQueue.front(); + PacketPtr pkt = memRespQueue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; @@ -110,14 +100,14 @@ BaseApplyEngine::processNextApplyEvent() if (!memPortBlocked()) { if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); - applyWriteQueue.pop(); + memRespQueue.pop(); } } } else { - applyWriteQueue.pop(); + memRespQueue.pop(); } - if (!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ + schedule(nextMemRespEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 2cb9d8b918..02646a74ff 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -45,21 +45,17 @@ namespace gem5 class BaseApplyEngine : 
public BaseEngine { private: - std::queue applyReadQueue; - std::queue applyWriteQueue; std::unordered_map requestOffset; EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - protected: - virtual bool handleMemResp(PacketPtr pkt); - virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; + virtual bool sendApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edgeIndex) = 0; + virtual void processNextMemRespEvent(); public: PARAMS(BaseApplyEngine); @@ -70,7 +66,6 @@ class BaseApplyEngine : public BaseEngine PortID idx=InvalidPortID) override; bool recvWLNotif(Addr addr); - }; } diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index d53e2e683a..6a50e1630e 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -35,7 +35,8 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), requestorId(system->getRequestorId()), - memPort(name() + ".memPort", this) + memPort(name() + ".memPort", this), + nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) {} @@ -72,4 +73,14 @@ BaseEngine::MemPort::recvReqRetry() } } +bool +BaseEngine::handleMemResp(PacketPtr pkt) +{ + memRespQueue.push(pkt); + if (!nextMemResponseEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemResponseEvent, nextCycle()); + } + return true; +} + } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index f9f500e118..4f5a29676d 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -66,14 +66,28 @@ class BaseEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; + System* system; const RequestorID requestorId; MemPort memPort; + bool handleMemResp(PacketPtr resp); + EventFunctionWrapper nextMemRespEvent; + protected: bool 
memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } - virtual bool handleMemResp(PacketPtr resp) = 0; + + // TODO: Add this later, maybe? + // int memRespQueueSize; + std::queue memRespQueue; + /* Respective function for nextMemRespEvent. + All the classes inheriting from this class will + do their main processing in this function. For + example, BaseWLEngine reduces the temp_pro with + the value of update in this function. + */ + virtual void processNextMemRespEvent() = 0; public: PARAMS(BaseEngine); @@ -82,7 +96,6 @@ class BaseEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - }; } diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index f2384c434b..4c43f95939 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -40,7 +40,6 @@ BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : // updateQueue(params.update_queue_size), // updateQueueLen(0), nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), - nextReadEvent([this] { processNextReadEvent(); }, name()), nextSendEvent([this] { processNextSendEvent(); }, name()) { } @@ -49,16 +48,6 @@ bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { - //FIXME: There should be a check if the queues are full. 
- // if (vertexQueueLen < vertexQueueSize) { - // vertexQueue.push(pkt) - // vertexQueueLen++; - // if (!nextReceiveEvent.scheduled()) { - // schedule(nextReceiveEvent, nextCycle()); - // } - // return true; - // } - // return false; notifQueue.emplace(prop, degree, edge_index); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); @@ -67,7 +56,7 @@ BasePushEngine::recvApplyNotif(uint32_t prop, } void -BasePushEngine::processNextReceiveEvent() +BasePushEngine::processNextReadEvent() { ApplyNotif notif = notifQueue.front(); @@ -95,39 +84,28 @@ BasePushEngine::processNextReceiveEvent() offset_queue.push_back(req_offset); num_edge_queue.push_back(1); } - } + }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); - memReqQueue.push(pkt); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = notif.prop; + if (!memPortBlocked()) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = notif.prop; + sendMemReq(pkt); + notifQueue.pop(); + } } - notifQueue.pop(); - - if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + if (!nextReadEvent.scheduled() && !notifQueue.empty()) { schedule(nextReadEvent, nextCycle()); } } void -BasePushEngine::processNextReadEvent() -{ - PacketPtr pkt = memReqQueue.front(); - if (!sendMemReq(pkt)) { - memReqQueue.pop(); - } - - if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { - schedule(nextReadEvent, nextCycle()); - } -} - -bool -BasePushEngine::handleMemResp(PacketPtr pkt) +BasePushEngine::processNextMemRespEvent() { + PacketPtr pkt = memRespQueue.front(); RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -137,7 +115,7 @@ BasePushEngine::handleMemResp(PacketPtr pkt) int edge_in_bytes = sizeof(Edge) / 
sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + i * edge_in_bytes; + uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); Edge e = memoryToEdge(curr_edge_data); uint32_t *update_data = new uint32_t; @@ -146,29 +124,14 @@ BasePushEngine::handleMemResp(PacketPtr pkt) PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, requestorId); - updateQueue.push(update); - } - - if (!nextSendEvent.scheduled() && !updateQueue.empty()) { - schedule(nextSendEvent, nextCycle()); - } - - //TODO: Should we always return true? It's the response from the memory - // so maybe yes. We assume the receiving bandwidth of the PushEngine is - // higher than its demand bandwidth - return true; -} - -void -BasePushEngine::processNextSendEvent() -{ - PacketPtr pkt = updateQueue.front(); - if (!sendPushUpdate(pkt)) { - updateQueue.pop(); + if (sendPushUpdate(update)) { + memRespQueue.pop(); + // TODO: Erase map entries here. 
+ } } - if (!nextSendEvent.scheduled() && !updateQueue.empty()) { - schedule(nextSendEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemRespEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index f568b6ecc3..5a6ef85b0f 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -43,7 +43,6 @@ namespace gem5 class BasePushEngine : public BaseEngine { private: - struct ApplyNotif { uint32_t prop; uint32_t degree; @@ -53,30 +52,20 @@ class BasePushEngine : public BaseEngine prop(prop), degree(degree), edgeIndex(edge_index) {} }; + std::queue notifQueue; // int vertexQueueSize; - // int vertexQueueLen; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - std::queue updateQueue; - // int updateQueueSize; - // int updateQueueLen; - - EventFunctionWrapper nextReceiveEvent; - void processNextReceiveEvent(); - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - EventFunctionWrapper nextSendEvent; - void processNextSendEvent(); - protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual bool handleMemResp(PacketPtr pkt); + virtual void processNextMemRespEvent(); public: @@ -85,7 +74,6 @@ class BasePushEngine : public BaseEngine BasePushEngine(const BasePushEngineParams ¶ms); bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index d5b18bafa0..5d84e34ccd 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -63,7 +63,7 @@ void BaseWLEngine::processNextWLReadEvent() requestValueMap[memPkt->req] = value; if (memPortBlocked()) { - sendMemReq(memPkt) + sendMemReq(memPkt); updateQueue.pop(); } if (!queue.empty() && !nextWLReadEvent.scheduled()) { @@ -71,20 +71,10 @@ void 
BaseWLEngine::processNextWLReadEvent() } } -bool -BaseWLEngine::handleMemResp(PacketPtr pkt) -{ - responseQueue.push(pkt); - if(!nextWLReduceEvent.scheduled()){ - schedule(nextWLReduceEvent, nextCycle()); - } - return true; -} - void -BaseWLEngine::processNextWLReduceEvent() +BaseWLEngine::processNextMemRespEvent() { - PacketPtr resp = responseQueue.front(); + PacketPtr resp = memRespQueue.front(); uint8_t* respData = resp->getPtr(); Addr request_offset = requestOffsetMap[resp->req]; uint32_t value = requestValueMap[resp->req]; @@ -102,15 +92,15 @@ BaseWLEngine::processNextWLReduceEvent() if (!memPortBlocked()) { if (sendWLNotif(pkt->getAddr() + request_offset)) { sendMemReq(writePkt); - responseQueue.pop(); + memRespQueue.pop(); // TODO: Erase map entries, delete wlData; } } } else { - responseQueue.pop(); + memRespQueue.pop(); } - if (!responseQueue.empty() && !nextWLReduceEvent.scheduled()){ + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 063e9909be..ab8952de41 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -69,8 +69,8 @@ class BaseWLEngine : public BaseEngine Write edgelist loc in buffer */ protected: - virtual bool handleMemResp(PacketPtr resp); virtual bool sendWLNotif(Addr addr) = 0; + virtual void processNextMemRespEvent(); public: @@ -82,7 +82,6 @@ class BaseWLEngine : public BaseEngine PortID idx=InvalidPortID) override; bool handleWLUpdate(PacketPtr pkt); - }; } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index cf241c9063..8b5ba20b1c 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -129,8 +129,6 @@ class MPU : public ClockedObject void handleMemResp(PacketPtr pkt); bool handleWLUpdate(PacketPtr pkt); - bool recvWLNotif(Addr addr); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t 
edge_index); bool recvPushUpdate(PacketPtr pkt); }; From 4d2ad56c51ecfd4070a0800d9ec51cf5fc5aa225 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:21:51 -0800 Subject: [PATCH 053/247] Pointer issue fixed. --- src/accl/graph/sega/MPU.py | 4 --- src/accl/graph/sega/WLEngine.py | 3 +- src/accl/graph/sega/apply_engine.cc | 14 +++----- src/accl/graph/sega/apply_engine.hh | 7 ++-- src/accl/graph/sega/mpu.cc | 55 ++++------------------------- src/accl/graph/sega/mpu.hh | 10 +----- src/accl/graph/sega/push_engine.cc | 15 +------- src/accl/graph/sega/push_engine.hh | 5 --- src/accl/graph/sega/wl_engine.cc | 14 +++----- src/accl/graph/sega/wl_engine.hh | 7 ++-- 10 files changed, 23 insertions(+), 111 deletions(-) diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index efd8dbc11f..71b8841b10 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,12 +38,8 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - apply_engine = Param.ApplyEngine(NULL, "ApplyEngine object to connect to " - "This MPU") push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " "This MPU") - work_list_engine = Param.WLEngine(NULL, "WLEngine object to connect to " - "This MPU") respPort = ResponsePort("Port to Receive updates from outside") reqPort = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 12fbcf9b4f..3bfe9fa16f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,4 +34,5 @@ class WLEngine(BaseWLEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this WLEngine") \ No newline at end of file + apply_engine = Param.ApplyEngine(Parent.any, + "MPU object that owns this WLEngine") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 
bc45850041..0f686e7f8c 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,24 +27,20 @@ */ #include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/push_engine.hh" namespace gem5{ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : BaseApplyEngine(params), - mpu(params.mpu) + pushEngine(params.push_engine) {} bool -ApplyEngine::sendMemReq(PacketPtr pkt){ - return mpu->handleMemReq(pkt); -} +ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +{ + return push_engine->recvApplyNotif(prop, degree, edgeIndex); -bool -ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ - mpu->recvApplyNotif(prop, degree, edgeIndex); - return true; } } \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index c7d3073e36..4d828c6aa1 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -42,17 +42,14 @@ namespace gem5 { -class MPU; +class PushEngine; class ApplyEngine : public BaseApplyEngine { private: - - MPU* mpu; + PushEngine* pushEngine; protected: - - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 4824bcd699..23a777d1c6 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -33,12 +33,9 @@ namespace gem5 MPU::MPU(const MPUParams ¶ms): ClockedObject(params), - nextRequestorId(0), respPort(name() + ".respPort", this), reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), - applyEngine(params.apply_engine), - pushEngine(params.push_engine), wlEngine(params.work_list_engine) {} @@ -59,16 +56,6 @@ MPU::getPort(const std::string &if_name, PortID idx) void MPU::startup() { - if (((int16_t) applyEngine->getRequestorId()) == -1) { - 
applyEngine->setRequestorId(nextRequestorId++); - } - if (((int16_t) pushEngine->getRequestorId()) == -1) { - pushEngine->setRequestorId(nextRequestorId++); - } - if (((int16_t) wlEngine->getRequestorId()) == -1) { - wlEngine->setRequestorId(nextRequestorId++); - } - //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { @@ -177,9 +164,7 @@ MPU::MPUMemPort::sendPacket(PacketPtr pkt) bool MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) { - //TODO: Investigate sending true all the time - owner->handleMemResp(pkt); - return true; + panic("recvTimingResp called on MPU::MPUMemPort memPort."); } void @@ -224,16 +209,7 @@ MPU::handleMemReq(PacketPtr pkt) void MPU::handleMemResp(PacketPtr pkt) { - RequestorID requestorId = pkt->requestorId(); - if (applyEngine->getRequestorId() == requestorId) { - applyEngine->handleMemResp(pkt); - } else if (pushEngine->getRequestorId() == requestorId) { - pushEngine->handleMemResp(pkt); - } else if (wlEngine->getRequestorId() == requestorId) { - wlEngine->handleMemResp(pkt); - } else { - panic("Received a response with an unknown requestorId."); - } + panic("MPU::handleMemResp called!"); } bool @@ -242,39 +218,20 @@ MPU::handleWLUpdate(PacketPtr pkt) return wlEngine->handleWLUpdate(pkt); } -bool -MPU::recvWLNotif(Addr addr) -{ - return applyEngine->recvWLNotif(addr); -} - -bool -MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) -{ - return pushEngine->recvApplyNotif(prop, degree, edge_index); -} - bool MPU::recvPushUpdate(PacketPtr pkt) { Addr addr = pkt->getAddr(); for (auto addr_range: memPort.getAddrRanges()) { if (addr_range.contains(addr)) { - if (memPort.blocked()) { - return false; - } else { - memPort.sendPacket(pkt); - return true; - } + return handleWLUpdate(pkt); } } - - if (reqPort.blocked()) { - return false; + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; } - reqPort.sendPacket(pkt); return true; - } } diff 
--git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8b5ba20b1c..2df8993749 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -103,18 +103,13 @@ class MPU : public ClockedObject virtual void recvReqRetry(); }; - virtual void startup(); - - RequestorID nextRequestorId; - MPURespPort respPort; MPUReqPort reqPort; MPUMemPort memPort; - ApplyEngine* applyEngine; - PushEngine* pushEngine; WLEngine* wlEngine; + virtual void startup(); AddrRangeList getAddrRanges(); void recvFunctional(PacketPtr pkt); @@ -125,9 +120,6 @@ class MPU : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool handleMemReq(PacketPtr pkt); - void handleMemResp(PacketPtr pkt); - bool handleWLUpdate(PacketPtr pkt); bool recvPushUpdate(PacketPtr pkt); }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 922ae32ed2..71cb2955fd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -35,20 +35,7 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), mpu(params.mpu) -{ -} - -Port & -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -bool -PushEngine::sendMemReq(PacketPtr pkt) -{ - return mpu->handleMemReq(pkt); -} +{} bool PushEngine::sendPushUpdate(PacketPtr pkt) diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1a800e58f3..7b3474d2ec 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -43,16 +43,11 @@ class PushEngine : public BasePushEngine MPU* mpu; protected: - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendPushUpdate(PacketPtr pkt); public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - }; } diff --git a/src/accl/graph/sega/wl_engine.cc 
b/src/accl/graph/sega/wl_engine.cc index 40ec755969..3d9d7af0c6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,25 +27,19 @@ */ #include "accl/graph/sega/wl_engine.hh" -#include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/apply_engine.hh" + namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), - mpu(params.mpu) + applyEngine(params.apply_engine) {} -bool -WLEngine::sendMemReq(PacketPtr pkt){ - return mpu->handleMemReq(pkt); -} - -// FIXME: handle the case where Apply queue is full bool WLEngine::sendWLNotif(Addr addr){ - mpu->recvWLNotif(addr); - return true; + apply_engine->recvWLNotif(addr); } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 238ffbe724..c154867b0d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,17 +45,14 @@ namespace gem5 { -// class MPU; +class ApplyEngine; class WLEngine : public BaseWLEngine { private: - - MPU* mpu; + ApplyEngine* applyEngine; protected: - - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendWLNotif(Addr addr); public: From 39883a68c9f8c2895ce9c0a5315dd9cf4eec7a9c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:24:07 -0800 Subject: [PATCH 054/247] Adding BaseEngine to SConscript. 
--- src/accl/graph/base/SConscript | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index cc55100064..41c48fc419 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -28,10 +28,12 @@ Import('*') SimObject('BaseApplyEngine.py') +SimObject('BaseEngine.py') SimObject('BasePushEngine.py') SimObject('BaseWLEngine.py') Source('base_apply_engine.cc') +Source('base_engine.cc') Source('base_push_engine.cc') Source('base_wl_engine.cc') Source('util.cc') From adfa21a1a8b9ee69b7e75dab14e8db2f1be7e2ca Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:43:55 -0800 Subject: [PATCH 055/247] Compilation issues fixed. Still linking issues. --- src/accl/graph/base/BaseEngine.py | 1 + src/accl/graph/base/base_apply_engine.cc | 3 +-- src/accl/graph/base/base_engine.cc | 6 +++--- src/accl/graph/base/base_engine.hh | 14 +++++++------- src/accl/graph/base/base_push_engine.cc | 16 +++++----------- src/accl/graph/base/base_wl_engine.cc | 10 +++++----- src/accl/graph/sega/MPU.py | 8 ++------ src/accl/graph/sega/apply_engine.cc | 3 +-- src/accl/graph/sega/apply_engine.hh | 2 +- src/accl/graph/sega/mpu.cc | 14 -------------- src/accl/graph/sega/mpu.hh | 2 -- src/accl/graph/sega/push_engine.cc | 1 - src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 3 +-- src/accl/graph/sega/wl_engine.hh | 1 + 15 files changed, 29 insertions(+), 56 deletions(-) diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py index 3eb5f0cbbc..367df8dbc1 100644 --- a/src/accl/graph/base/BaseEngine.py +++ b/src/accl/graph/base/BaseEngine.py @@ -35,4 +35,5 @@ class BaseEngine(ClockedObject): cxx_header = "accl/graph/base/base_engine.hh" cxx_class = 'gem5::BaseEngine' + system = Param.System(Parent.any, 'System this Engine is a part of') memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc 
b/src/accl/graph/base/base_apply_engine.cc index 842481c2d1..b7f3030e00 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) {} bool diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 6a50e1630e..06827c1d4e 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -34,8 +34,8 @@ namespace gem5 BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), - requestorId(system->getRequestorId()), memPort(name() + ".memPort", this), + requestorId(system->getRequestorId(this)), nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) {} @@ -77,8 +77,8 @@ bool BaseEngine::handleMemResp(PacketPtr pkt) { memRespQueue.push(pkt); - if (!nextMemResponseEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemResponseEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemRespEvent, nextCycle()); } return true; } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 4f5a29676d..057a4c6d91 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -68,25 +68,25 @@ class BaseEngine : public ClockedObject }; System* system; - const RequestorID requestorId; MemPort memPort; bool handleMemResp(PacketPtr resp); - EventFunctionWrapper nextMemRespEvent; protected: - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } - + const RequestorID requestorId; // TODO: Add this later, maybe? 
// int memRespQueueSize; std::queue memRespQueue; - /* Respective function for nextMemRespEvent. - All the classes inheriting from this class will + + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + + /* All the classes inheriting from this class will do their main processing in this function. For example, BaseWLEngine reduces the temp_pro with the value of update in this function. */ + EventFunctionWrapper nextMemRespEvent; virtual void processNextMemRespEvent() = 0; public: diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 4c43f95939..187eefe01b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -34,23 +34,17 @@ namespace gem5 { BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : - ClockedObject(params), - // vertexQueueSize(params.vertex_queue_size), - // vertexQueueLen(0), - // updateQueue(params.update_queue_size), - // updateQueueLen(0), - nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), - nextSendEvent([this] { processNextSendEvent(); }, name()) -{ -} + BaseEngine(params), + nextReadEvent([this] { processNextReadEvent(); }, name()) +{} bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { notifQueue.emplace(prop, degree, edge_index); - if (!nextReceiveEvent.scheduled()) { - schedule(nextReceiveEvent, nextCycle()); + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); } return true; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 5d84e34ccd..20abaa7b20 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -52,7 +52,7 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt) void BaseWLEngine::processNextWLReadEvent() { PacketPtr pkt = updateQueue.front(); - uint32_t data = *(pkt->getPtr()); + uint32_t value = 
*(pkt->getPtr()); Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; @@ -66,7 +66,7 @@ void BaseWLEngine::processNextWLReadEvent() sendMemReq(memPkt); updateQueue.pop(); } - if (!queue.empty() && !nextWLReadEvent.scheduled()) { + if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); } } @@ -78,7 +78,7 @@ BaseWLEngine::processNextMemRespEvent() uint8_t* respData = resp->getPtr(); Addr request_offset = requestOffsetMap[resp->req]; uint32_t value = requestValueMap[resp->req]; - WorkListItem wl = memoryToWorkList(data + request_offset); + WorkListItem wl = memoryToWorkList(respData + request_offset); if (value < wl.temp_prop){ //update prop with temp_prop @@ -87,10 +87,10 @@ BaseWLEngine::processNextMemRespEvent() uint8_t* wlData = workListToMemory(wl); memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, respData, requestorId); + getWritePacket(resp->getAddr(), 64, respData, requestorId); if (!memPortBlocked()) { - if (sendWLNotif(pkt->getAddr() + request_offset)) { + if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); // TODO: Erase map entries, delete wlData; diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 71b8841b10..87de0fb7d6 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -28,18 +28,14 @@ from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject - -# from m5.objects.WLEngine import WLEngine -# from m5.objects.PushEngine import PushEngine -# from m5.objects.ApplyEngine import ApplyEngine +from m5.objects.WLEngine import WLEngine class MPU(ClockedObject): type = 'MPU' cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " - "This MPU") + work_list_engine = Param.WLEngine(NULL, "WLEngine to connect to this MPU") respPort = 
ResponsePort("Port to Receive updates from outside") reqPort = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 0f686e7f8c..bc3d703cf6 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/push_engine.hh" namespace gem5{ @@ -39,7 +38,7 @@ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : bool ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) { - return push_engine->recvApplyNotif(prop, degree, edgeIndex); + return pushEngine->recvApplyNotif(prop, degree, edgeIndex); } diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 4d828c6aa1..aff2c5417b 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_apply_engine.hh" +#include "accl/graph/sega/push_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/ApplyEngine.hh" @@ -42,7 +43,6 @@ namespace gem5 { -class PushEngine; class ApplyEngine : public BaseApplyEngine { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 23a777d1c6..9bda696cb5 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -198,20 +198,6 @@ MPU::recvFunctional(PacketPtr pkt) } } -bool -MPU::handleMemReq(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - memPort.sendPacket(pkt); - return true; -} - -void -MPU::handleMemResp(PacketPtr pkt) -{ - panic("MPU::handleMemResp called!"); -} - bool MPU::handleWLUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 2df8993749..a0472eead5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,8 +29,6 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ 
-#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/port.hh" diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 71cb2955fd..a1fa86da2b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/push_engine.hh" -#include "accl/graph/sega/mpu.hh" namespace gem5 { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7b3474d2ec..edf698011d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "params/PushEngine.hh" namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 3d9d7af0c6..823aa49bb9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" namespace gem5 { @@ -39,7 +38,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): bool WLEngine::sendWLNotif(Addr addr){ - apply_engine->recvWLNotif(addr); + return applyEngine->recvWLNotif(addr); } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c154867b0d..6946713aaa 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_wl_engine.hh" +#include "accl/graph/sega/apply_engine.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" From 05771a071f7016fe66fc0da8e551ef793ac0c059 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 13:19:29 -0800 Subject: [PATCH 056/247] Removing unnecessary includes. 
--- src/accl/graph/base/base_apply_engine.cc | 4 ++-- src/accl/graph/base/base_apply_engine.hh | 5 +---- src/accl/graph/base/base_engine.hh | 4 +--- src/accl/graph/base/base_push_engine.hh | 4 +--- src/accl/graph/base/base_wl_engine.hh | 8 -------- 5 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index b7f3030e00..009c01ccb7 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -59,7 +59,7 @@ BaseApplyEngine::processNextApplyCheckEvent() // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; - int req_offset = (addr % 64); + Addr req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; @@ -79,7 +79,7 @@ BaseApplyEngine::processNextMemRespEvent() uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; - int request_offset = requestOffset[request]; + Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); // FIXME: Not so much of a fixme. 
However, why do we fwd a worklistitem diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 02646a74ff..e3fe47d923 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -33,11 +33,8 @@ #include #include "accl/graph/base/base_engine.hh" -#include "mem/packet.hh" -#include "mem/port.hh" #include "mem/request.hh" #include "params/BaseApplyEngine.hh" -#include "sim/port.hh" namespace gem5 { @@ -47,7 +44,7 @@ class BaseApplyEngine : public BaseEngine private: std::queue applyReadQueue; - std::unordered_map requestOffset; + std::unordered_map requestOffset; EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 057a4c6d91..b0b05d9477 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -34,10 +34,8 @@ #include "mem/packet.hh" #include "mem/port.hh" -#include "mem/request.hh" #include "params/BaseEngine.hh" #include "sim/clocked_object.hh" -#include "sim/port.hh" #include "sim/system.hh" namespace gem5 @@ -79,7 +77,7 @@ class BaseEngine : public ClockedObject std::queue memRespQueue; bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } /* All the classes inheriting from this class will do their main processing in this function. 
For diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 5a6ef85b0f..0da4241dfd 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -32,9 +32,7 @@ #include #include "accl/graph/base/base_engine.hh" -#include "mem/port.hh" #include "mem/request.hh" -#include "mem/packet.hh" #include "params/BasePushEngine.hh" namespace gem5 @@ -54,7 +52,7 @@ class BasePushEngine : public BaseEngine }; std::queue notifQueue; - // int vertexQueueSize; + // int notifQueueSize; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index ab8952de41..3ca9a146a1 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -34,12 +34,7 @@ #include "accl/graph/base/base_engine.hh" #include "accl/graph/base/util.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" #include "params/BaseWLEngine.hh" -#include "sim/port.hh" -#include "sim/system.hh" namespace gem5 { @@ -78,9 +73,6 @@ class BaseWLEngine : public BaseEngine BaseWLEngine(const BaseWLEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - bool handleWLUpdate(PacketPtr pkt); }; From 01b4b2a5a80247c969243bbb52bbbe9bd4ef41f8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 13:51:30 -0800 Subject: [PATCH 057/247] Fixing the issue of calling pure virtual function. 
--- src/accl/graph/base/base_apply_engine.cc | 17 +++++++++++++---- src/accl/graph/base/base_apply_engine.hh | 6 +++++- src/accl/graph/base/base_engine.cc | 7 ++----- src/accl/graph/base/base_engine.hh | 8 +------- src/accl/graph/base/base_push_engine.cc | 17 +++++++++++++---- src/accl/graph/base/base_push_engine.hh | 5 ++++- src/accl/graph/base/base_wl_engine.cc | 13 +++++++++++-- src/accl/graph/base/base_wl_engine.hh | 2 +- 8 files changed, 50 insertions(+), 25 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 009c01ccb7..e7b7dd6a22 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,7 +37,8 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} bool @@ -73,7 +74,7 @@ BaseApplyEngine::processNextApplyCheckEvent() } void -BaseApplyEngine::processNextMemRespEvent() +BaseApplyEngine::processNextApplyEvent() { PacketPtr pkt = memRespQueue.front(); uint8_t* data = pkt->getPtr(); @@ -105,8 +106,16 @@ BaseApplyEngine::processNextMemRespEvent() } else { memRespQueue.pop(); } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextMemRespEvent, nextCycle()); + if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ + schedule(nextApplyEvent, nextCycle()); + } +} + +void +BaseApplyEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index e3fe47d923..486fb687fe 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -49,10 +49,14 @@ class 
BaseApplyEngine : public BaseEngine EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - virtual void processNextMemRespEvent(); + + virtual void scheduleMainEvent(); public: PARAMS(BaseApplyEngine); diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 06827c1d4e..245192643c 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -35,8 +35,7 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), memPort(name() + ".memPort", this), - requestorId(system->getRequestorId(this)), - nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) + requestorId(system->getRequestorId(this)) {} @@ -77,9 +76,7 @@ bool BaseEngine::handleMemResp(PacketPtr pkt) { memRespQueue.push(pkt); - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemRespEvent, nextCycle()); - } + scheduleMainEvent(); return true; } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index b0b05d9477..3436229aa1 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -79,13 +79,7 @@ class BaseEngine : public ClockedObject bool memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } - /* All the classes inheriting from this class will - do their main processing in this function. For - example, BaseWLEngine reduces the temp_pro with - the value of update in this function. 
- */ - EventFunctionWrapper nextMemRespEvent; - virtual void processNextMemRespEvent() = 0; + virtual void scheduleMainEvent() = 0; public: PARAMS(BaseEngine); diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 187eefe01b..a963cc9709 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -35,7 +35,8 @@ namespace gem5 BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : BaseEngine(params), - nextReadEvent([this] { processNextReadEvent(); }, name()) + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextPushEvent([this] { processNextPushEvent(); }, name()) {} bool @@ -97,7 +98,7 @@ BasePushEngine::processNextReadEvent() } void -BasePushEngine::processNextMemRespEvent() +BasePushEngine::processNextPushEvent() { PacketPtr pkt = memRespQueue.front(); RequestPtr req = pkt->req; @@ -124,8 +125,16 @@ BasePushEngine::processNextMemRespEvent() } } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemRespEvent, nextCycle()); + if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextPushEvent, nextCycle()); + } +} + +void +BasePushEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextPushEvent.scheduled()) { + schedule(nextPushEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 0da4241dfd..8bb7d6663a 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -61,9 +61,12 @@ class BasePushEngine : public BaseEngine EventFunctionWrapper nextReadEvent; void processNextReadEvent(); + EventFunctionWrapper nextPushEvent; + void processNextPushEvent(); + protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void processNextMemRespEvent(); + virtual void scheduleMainEvent(); public: diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 
20abaa7b20..ef66603de7 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -72,7 +72,7 @@ void BaseWLEngine::processNextWLReadEvent() } void -BaseWLEngine::processNextMemRespEvent() +BaseWLEngine::processNextWLReduceEvent() { PacketPtr resp = memRespQueue.front(); uint8_t* respData = resp->getPtr(); @@ -100,9 +100,18 @@ BaseWLEngine::processNextMemRespEvent() else { memRespQueue.pop(); } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ + if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } } +void +BaseWLEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextWLReduceEvent.scheduled()) { + schedule(nextWLReduceEvent, nextCycle()); + } +} + + } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3ca9a146a1..a5070f0b26 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,7 +65,7 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; - virtual void processNextMemRespEvent(); + virtual void scheduleMainEvent(); public: From 235746cdf270f617df2c556e3a676d7f4d02b355 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 17:33:02 -0800 Subject: [PATCH 058/247] Fixed cycle in hierarchy and config. Sim starts. 
--- configs/accl/sega.py | 40 ++-- src/accl/graph/base/BaseEngine.py | 2 +- src/accl/graph/base/base_apply_engine.hh | 5 +- src/accl/graph/base/base_engine.cc | 12 ++ src/accl/graph/base/base_engine.hh | 5 +- src/accl/graph/base/base_push_engine.hh | 2 +- src/accl/graph/base/base_wl_engine.hh | 2 +- src/accl/graph/sega/MPU.py | 42 ----- src/accl/graph/sega/PushEngine.py | 2 +- src/accl/graph/sega/SConscript | 2 - src/accl/graph/sega/WLEngine.py | 1 + src/accl/graph/sega/apply_engine.hh | 3 +- src/accl/graph/sega/mpu.cc | 223 ----------------------- src/accl/graph/sega/mpu.hh | 127 ------------- src/accl/graph/sega/push_engine.cc | 49 ++++- src/accl/graph/sega/push_engine.hh | 27 ++- src/accl/graph/sega/wl_engine.cc | 88 +++++++++ src/accl/graph/sega/wl_engine.hh | 34 +++- 18 files changed, 238 insertions(+), 428 deletions(-) delete mode 100644 src/accl/graph/sega/MPU.py delete mode 100644 src/accl/graph/sega/mpu.cc delete mode 100644 src/accl/graph/sega/mpu.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 288b1211e4..ea158ecdc9 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,28 +1,46 @@ import m5 from m5.objects import * -class PyMPU(MPU): - def __init__(self, clk_domain): - super().__init__() - self.clk_domain = clk_domain - self.apply_engine = ApplyEngine() +class MPU(SubSystem): + def __init__(self): + super(MPU, self).__init__() self.push_engine = PushEngine() - self.wl_engine = WLEngine() + self.apply_engine = ApplyEngine(push_engine = self.push_engine) + self.wl_engine = WLEngine(apply_engine = self.apply_engine) + self.interconnect = SystemXBar() -class SEGA(System): + self.interconnect.cpu_side_ports = self.wl_engine.mem_port + self.interconnect.cpu_side_ports = self.apply_engine.mem_port + self.interconnect.cpu_side_ports = self.push_engine.mem_port + + def getRespPort(self): + return self.wl_engine.resp_port + def setRespPort(self, port): + self.wl_engine.resp_port = port + + def getReqPort(self): + return 
self.push_engine.req_port + def setReqPort(self, port): + self.push_engine.req_port = port + def getMemPort(self): + return self.interconnect.mem_side_ports + def setMemPort(self, port): + self.interconnect.mem_side_ports = port + +class SEGA(System): def __init__(self): super(SEGA, self).__init__() - # Set up the clock domain and the voltage domain + self.clk_domain = SrcClockDomain() self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.mpu = PyMPU(self.clk_domain) + self.mpu = MPU() self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) - self.mpu.memPort = self.mem_ctrl.port - self.mpu.reqPort = self.mpu.respPort + self.mpu.setReqPort(self.mpu.getRespPort()) + self.mpu.setMemPort(self.mem_ctrl.port) system = SEGA() root = Root(full_system = False, system = system) diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py index 367df8dbc1..16c2f402e5 100644 --- a/src/accl/graph/base/BaseEngine.py +++ b/src/accl/graph/base/BaseEngine.py @@ -36,4 +36,4 @@ class BaseEngine(ClockedObject): cxx_class = 'gem5::BaseEngine' system = Param.System(Parent.any, 'System this Engine is a part of') - memPort = RequestPort("Port to communicate with the memory") + mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 486fb687fe..9111bd074b 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -56,16 +56,13 @@ class BaseApplyEngine : public BaseEngine virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: PARAMS(BaseApplyEngine); BaseApplyEngine(const BaseApplyEngineParams &apply); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - bool recvWLNotif(Addr addr); }; diff --git 
a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 245192643c..6b40ba4137 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -38,6 +38,18 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : requestorId(system->getRequestorId(this)) {} +BaseEngine::~BaseEngine() +{} + +Port& +BaseEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} void BaseEngine::MemPort::sendPacket(PacketPtr pkt) diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 3436229aa1..53415ddc7c 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -32,6 +32,7 @@ #include #include +#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/BaseEngine.hh" @@ -78,6 +79,8 @@ class BaseEngine : public ClockedObject bool memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } virtual void scheduleMainEvent() = 0; @@ -85,7 +88,7 @@ class BaseEngine : public ClockedObject PARAMS(BaseEngine); BaseEngine(const BaseEngineParams ¶ms); - + ~BaseEngine(); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 8bb7d6663a..01027d2791 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -66,7 +66,7 @@ class BasePushEngine : public BaseEngine protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/base/base_wl_engine.hh 
b/src/accl/graph/base/base_wl_engine.hh index a5070f0b26..38079f8f94 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,7 +65,7 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py deleted file mode 100644 index 87de0fb7d6..0000000000 --- a/src/accl/graph/sega/MPU.py +++ /dev/null @@ -1,42 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -from m5.objects.WLEngine import WLEngine - -class MPU(ClockedObject): - type = 'MPU' - cxx_header = "accl/graph/sega/mpu.hh" - cxx_class = 'gem5::MPU' - - work_list_engine = Param.WLEngine(NULL, "WLEngine to connect to this MPU") - - respPort = ResponsePort("Port to Receive updates from outside") - reqPort = RequestPort("Port to send updates to the outside") - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index eb0eed18ab..a743b57262 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,4 +34,4 @@ class PushEngine(BasePushEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this PushEngine") + req_port = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index dc19ece06b..f20d0e44df 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -28,11 +28,9 @@ Import('*') SimObject('ApplyEngine.py') -SimObject('MPU.py') SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') -Source('mpu.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py 
b/src/accl/graph/sega/WLEngine.py index 3bfe9fa16f..2d650ecb92 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,5 +34,6 @@ class WLEngine(BaseWLEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' + resp_port = ResponsePort("Port to Receive updates from outside") apply_engine = Param.ApplyEngine(Parent.any, "MPU object that owns this WLEngine") diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index aff2c5417b..1190786e36 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -50,7 +50,8 @@ class ApplyEngine : public BaseApplyEngine PushEngine* pushEngine; protected: - virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + virtual bool sendApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edgeIndex) override; public: PARAMS(ApplyEngine); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc deleted file mode 100644 index 9bda696cb5..0000000000 --- a/src/accl/graph/sega/mpu.cc +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/sega/mpu.hh" - -namespace gem5 -{ - -MPU::MPU(const MPUParams ¶ms): - ClockedObject(params), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), - memPort(name() + ".memPort", this), - wlEngine(params.work_list_engine) -{} - -Port& -MPU::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "respPort") { - return respPort; - } else if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -MPU::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. 
- WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 - }; - Edge edges [6] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - memPort.sendFunctional(pkt); - } - - for (int i = 0; i < 6; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - memPort.sendFunctional(pkt); - } -} - -AddrRangeList -MPU::MPURespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -MPU::MPURespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleWLUpdate(pkt); -} - -Tick -MPU::MPURespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -MPU::MPURespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -MPU::MPURespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -MPU::MPUReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -MPU::MPUReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -MPU::MPUReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -void -MPU::MPUMemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on MPU::MPUMemPort memPort."); -} - -void -MPU::MPUMemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -AddrRangeList -MPU::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - -void -MPU::recvFunctional(PacketPtr pkt) -{ - if (pkt->cmd == MemCmd::UpdateWL) { - panic("Functional requests should not be made to WL."); - //TODO: Might be a good idea to implement later. 
- // wlEngine->recvFunctional(pkt); - } else { - memPort.sendFunctional(pkt); - } -} - -bool -MPU::handleWLUpdate(PacketPtr pkt) -{ - return wlEngine->handleWLUpdate(pkt); -} - -bool -MPU::recvPushUpdate(PacketPtr pkt) -{ - Addr addr = pkt->getAddr(); - for (auto addr_range: memPort.getAddrRanges()) { - if (addr_range.contains(addr)) { - return handleWLUpdate(pkt); - } - } - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return true; -} - -} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh deleted file mode 100644 index a0472eead5..0000000000 --- a/src/accl/graph/sega/mpu.hh +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ -#define __ACCL_GRAPH_SEGA_MPU_HH__ - -#include "accl/graph/sega/wl_engine.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" -#include "params/MPU.hh" -#include "sim/clocked_object.hh" - -namespace gem5 -{ - -class MPU : public ClockedObject -{ - private: - class MPURespPort : public ResponsePort - { - private: - MPU* owner; - - public: - MPURespPort(const std::string& name, MPU* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class MPUReqPort : public RequestPort - { - private: - MPU* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MPUReqPort(const std::string& name, MPU* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - class MPUMemPort : public RequestPort - { - private: - MPU* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MPUMemPort(const std::string& name, MPU* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - 
void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - MPURespPort respPort; - MPUReqPort reqPort; - MPUMemPort memPort; - - WLEngine* wlEngine; - - virtual void startup(); - AddrRangeList getAddrRanges(); - void recvFunctional(PacketPtr pkt); - - public: - PARAMS(MPU); - MPU(const MPUParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - bool handleWLUpdate(PacketPtr pkt); - bool recvPushUpdate(PacketPtr pkt); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a1fa86da2b..c7b229ad33 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -33,13 +33,58 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), - mpu(params.mpu) + reqPort(name() + "reqPort", this) {} +Port& +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return BasePushEngine::getPort(if_name, idx); + } +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + bool PushEngine::sendPushUpdate(PacketPtr pkt) { - return mpu->recvPushUpdate(pkt); + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index edf698011d..604df4750d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,7 +30,6 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" -#include "accl/graph/sega/mpu.hh" #include "params/PushEngine.hh" namespace gem5 @@ -41,14 +40,36 @@ class MPU; class PushEngine : public BasePushEngine { private: - MPU* mpu; + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, PushEngine* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; protected: - virtual bool sendPushUpdate(PacketPtr pkt); + virtual bool sendPushUpdate(PacketPtr pkt) override; public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 823aa49bb9..e565ac119b 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ 
b/src/accl/graph/sega/wl_engine.cc @@ -33,12 +33,100 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), + respPort(name() + ".respPort", this), applyEngine(params.apply_engine) {} +Port& +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "resp_port") { + return respPort; + } else { + return BaseWLEngine::getPort(if_name, idx); + } +} + +void +WLEngine::startup() +{ + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + sendMemFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + sendMemFunctional(pkt); + } +} + bool WLEngine::sendWLNotif(Addr addr){ return applyEngine->recvWLNotif(addr); } +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleWLUpdate(pkt); +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->cmd == MemCmd::UpdateWL) { + panic("Functional requests should not be made to WL."); + //TODO: Might be a good 
idea to implement later. + // wlEngine->recvFunctional(pkt); + } else { + sendMemFunctional(pkt); + } +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 6946713aaa..f895a7ad32 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,14 +34,7 @@ #include "accl/graph/base/base_wl_engine.hh" #include "accl/graph/sega/apply_engine.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" #include "params/WLEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/port.hh" -#include "sim/system.hh" - namespace gem5 { @@ -51,14 +44,39 @@ class ApplyEngine; class WLEngine : public BaseWLEngine { private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + + public: + RespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + RespPort respPort; ApplyEngine* applyEngine; + + virtual void startup(); + void recvFunctional(PacketPtr pkt); + protected: - virtual bool sendWLNotif(Addr addr); + virtual bool sendWLNotif(Addr addr) override; public: PARAMS(WLEngine); WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } From d66efdf5a3e2e2fc4d425ad2f80ab22da10a19a5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 18:23:54 -0800 Subject: [PATCH 059/247] Started fixing memory leak. 
--- src/accl/graph/base/base_apply_engine.cc | 6 +++--- src/accl/graph/base/base_push_engine.cc | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index e7b7dd6a22..7b643969df 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,9 +61,9 @@ BaseApplyEngine::processNextApplyCheckEvent() Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; Addr req_offset = (addr % 64); - RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; + + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffset[memPkt->req] = req_offset; if (!memPortBlocked()) { sendMemReq(memPkt); applyReadQueue.pop(); diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index a963cc9709..6e5aa05779 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -112,7 +112,8 @@ BasePushEngine::processNextPushEvent() for (int i = 0; i < num_edges; i++) { uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); Edge e = memoryToEdge(curr_edge_data); - uint32_t *update_data = new uint32_t; + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; From df1340a91e5262a0d97faed7ffd39bf1e62af840 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 18:35:05 -0800 Subject: [PATCH 060/247] Adding newlines. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/apply_engine.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ea158ecdc9..54970d356e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -49,4 +49,4 @@ def __init__(self): exit_event = m5.simulate() print("Simulation finished!") -exit() \ No newline at end of file +exit() diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index bc3d703cf6..5d5f8daf26 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -42,4 +42,4 @@ ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) } -} \ No newline at end of file +} From ef0f9669a303035981a9ffc298b4acdf275d1ffc Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 24 Feb 2022 11:43:26 -0800 Subject: [PATCH 061/247] Removed the UpdateWL from the MemCmd. --- src/accl/graph/base/util.cc | 3 ++- src/accl/graph/sega/wl_engine.cc | 13 +++++++------ src/mem/packet.hh | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc index 0baa374714..4172607ed0 100644 --- a/src/accl/graph/base/util.cc +++ b/src/accl/graph/base/util.cc @@ -133,7 +133,8 @@ getUpdatePacket(Addr addr, unsigned int size, // bits req->setPC(((Addr)requestorId) << 2); - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->allocate(); pkt->setData(data); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e565ac119b..f3c63e71f3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -120,13 +120,14 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - if (pkt->cmd == MemCmd::UpdateWL) { - panic("Functional requests should not be made to WL."); - //TODO: Might be a good idea to 
implement later. - // wlEngine->recvFunctional(pkt); - } else { + // FIXME: This needs to be fixed + // if (pkt->cmd == MemCmd::UpdateWL) { + // panic("Functional requests should not be made to WL."); + // //TODO: Might be a good idea to implement later. + // // wlEngine->recvFunctional(pkt); + // } else { sendMemFunctional(pkt); - } + // } } } diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 5332ee32a2..a67abbbbaa 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,7 +149,7 @@ class MemCmd // Tlb shootdown TlbiExtSync, // MPU Accelerator - UpdateWL, + // UpdateWL, NUM_MEM_CMDS }; From acfffa3e25a866c6dc3aaa844ac195e530a44096 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 25 Feb 2022 11:49:51 -0800 Subject: [PATCH 062/247] Adding initial update. Fixing some bugs. --- src/accl/graph/base/base_wl_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index ef66603de7..1b9d92c1b4 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -62,7 +62,7 @@ void BaseWLEngine::processNextWLReadEvent() requestOffsetMap[memPkt->req] = req_offset; requestValueMap[memPkt->req] = value; - if (memPortBlocked()) { + if (!memPortBlocked()) { sendMemReq(memPkt); updateQueue.pop(); } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f3c63e71f3..61bee38c05 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -80,6 +80,15 @@ WLEngine::startup() 16, data, 0); sendMemFunctional(pkt); } + + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = getUpdatePacket( + 0, 4, first_update_data, requestorId); + + handleWLUpdate(first_update); } bool From 75825c3de944037f32c8b21d73106bcac77cbb00 Mon Sep 17 00:00:00 2001 From: 
Marjan Fariborz Date: Fri, 25 Feb 2022 13:35:24 -0800 Subject: [PATCH 063/247] Adding few debugging flags. --- src/accl/graph/base/SConscript | 2 ++ src/accl/graph/base/base_apply_engine.cc | 7 +++++++ src/accl/graph/base/base_push_engine.cc | 5 ++++- src/accl/graph/base/base_wl_engine.cc | 6 ++++++ src/accl/graph/sega/wl_engine.cc | 2 +- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 41c48fc419..c5c8c4e901 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -37,3 +37,5 @@ Source('base_engine.cc') Source('base_push_engine.cc') Source('base_wl_engine.cc') Source('util.cc') + +DebugFlag('MPU') diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 7b643969df..5eb9d90059 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -31,6 +31,8 @@ #include #include "accl/graph/base/util.hh" +#include "debug/MPU.hh" + namespace gem5 { @@ -83,6 +85,8 @@ BaseApplyEngine::processNextApplyEvent() Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); + DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item: %s\n" + , __func__, wl.to_string()); // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem // to applyengine if temp_prop < prop. If temp_prop has not changed, why // fwd it to applyengine? 
@@ -101,6 +105,9 @@ BaseApplyEngine::processNextApplyEvent() if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); memRespQueue.pop(); + DPRINTF(MPU, "%s: The Apply Engine is applying the new value", + "into WorkList Item: %s\n" + , __func__, wl.to_string()); } } } else { diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 6e5aa05779..f46941b8ed 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/base/base_push_engine.hh" #include "accl/graph/base/util.hh" +#include "debug/MPU.hh" namespace gem5 { @@ -47,6 +48,7 @@ BasePushEngine::recvApplyNotif(uint32_t prop, if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } + DPRINTF(MPU, "%s: Reading %d edges.", __func__, degree); return true; } @@ -114,7 +116,6 @@ BasePushEngine::processNextPushEvent() Edge e = memoryToEdge(curr_edge_data); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, @@ -122,6 +123,8 @@ BasePushEngine::processNextPushEvent() requestorId); if (sendPushUpdate(update)) { memRespQueue.pop(); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); // TODO: Erase map entries here. 
} } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 1b9d92c1b4..38ebf0f35b 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -27,6 +27,7 @@ */ #include "accl/graph/base/base_wl_engine.hh" +#include "debug/MPU.hh" #include @@ -80,6 +81,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); + DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s\n" + , __func__, wl.to_string()); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -89,10 +92,13 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); + if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); + DPRINTF(MPU, "%s: The WLE is chanching to: %s\n" + , __func__, wl.to_string()); // TODO: Erase map entries, delete wlData; } } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 61bee38c05..674004d7a5 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,7 @@ */ #include "accl/graph/sega/wl_engine.hh" - +#include "debug/MPU.hh" namespace gem5 { From d3f342cab70cc838b254365789afe4947d6677bc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 15:04:53 -0800 Subject: [PATCH 064/247] Adding lock_dir. 
--- configs/accl/sega.py | 6 +- src/accl/graph/base/base_apply_engine.cc | 19 ++++--- src/accl/graph/base/base_apply_engine.hh | 3 +- src/accl/graph/base/base_wl_engine.cc | 23 +++++--- src/accl/graph/base/base_wl_engine.hh | 2 + src/accl/graph/sega/ApplyEngine.py | 1 + src/accl/graph/sega/LockDir.py | 46 +++++++++++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/WLEngine.py | 1 + src/accl/graph/sega/apply_engine.cc | 15 ++++- src/accl/graph/sega/apply_engine.hh | 4 ++ src/accl/graph/sega/lock_dir.cc | 71 ++++++++++++++++++++++++ src/accl/graph/sega/lock_dir.hh | 57 +++++++++++++++++++ src/accl/graph/sega/wl_engine.cc | 15 ++++- src/accl/graph/sega/wl_engine.hh | 5 +- 15 files changed, 248 insertions(+), 22 deletions(-) create mode 100644 src/accl/graph/sega/LockDir.py create mode 100644 src/accl/graph/sega/lock_dir.cc create mode 100644 src/accl/graph/sega/lock_dir.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 54970d356e..db0bf4678f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,11 +4,13 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() + self.lock_dir = LockDirectory() self.push_engine = PushEngine() - self.apply_engine = ApplyEngine(push_engine = self.push_engine) - self.wl_engine = WLEngine(apply_engine = self.apply_engine) + self.apply_engine = ApplyEngine(push_engine = self.push_engine, lock_dir = self.lock_dir) + self.wl_engine = WLEngine(apply_engine = self.apply_engine, lock_dir = self.lock_dir) self.interconnect = SystemXBar() + self.interconnect.cpu_side_ports = self.wl_engine.mem_port self.interconnect.cpu_side_ports = self.apply_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 5eb9d90059..890d5dd313 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,14 +61,16 @@ 
BaseApplyEngine::processNextApplyCheckEvent() // pops items off queue, maybe we should pop every n cycles // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); + if (acquireAddress(addr)) { + Addr req_addr = (addr / 64) * 64; + Addr req_offset = (addr % 64); - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffset[memPkt->req] = req_offset; - if (!memPortBlocked()) { - sendMemReq(memPkt); - applyReadQueue.pop(); + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffset[memPkt->req] = req_offset; + if (!memPortBlocked()) { + sendMemReq(memPkt); + applyReadQueue.pop(); + } } if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -113,6 +115,9 @@ BaseApplyEngine::processNextApplyEvent() } else { memRespQueue.pop(); } + if (!releaseAddress(pkt->getAddr())) { + panic("Could not release an address"); + } if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ schedule(nextApplyEvent, nextCycle()); } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 9111bd074b..f4df298079 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -55,7 +55,8 @@ class BaseApplyEngine : public BaseEngine protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - + virtual bool acquireAddress(Addr addr) = 0; + virtual bool releaseAddress(Addr addr) = 0; virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 38ebf0f35b..7f1a27aae5 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -56,16 +56,18 @@ void BaseWLEngine::processNextWLReadEvent() uint32_t value = *(pkt->getPtr()); Addr addr = pkt->getAddr(); - Addr req_addr = 
(addr / 64) * 64; - Addr req_offset = addr % 64; + if (acquireAddress(addr)) { + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[memPkt->req] = req_offset; - requestValueMap[memPkt->req] = value; + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffsetMap[memPkt->req] = req_offset; + requestValueMap[memPkt->req] = value; - if (!memPortBlocked()) { - sendMemReq(memPkt); - updateQueue.pop(); + if (!memPortBlocked()) { + sendMemReq(memPkt); + updateQueue.pop(); + } } if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); @@ -92,7 +94,6 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); - if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); @@ -106,6 +107,10 @@ BaseWLEngine::processNextWLReduceEvent() else { memRespQueue.pop(); } + if (!releaseAddress(resp->getAddr())) { + panic("Could not release an address"); + } + std::cout << "success" << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 38079f8f94..15371f965b 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,6 +65,8 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; + virtual bool acquireAddress(Addr addr) = 0; + virtual bool releaseAddress(Addr addr) = 0; virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index 5bb0dc0c25..7a446bb620 100644 --- a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -35,3 +35,4 @@ class ApplyEngine(BaseApplyEngine): cxx_class = 
'gem5::ApplyEngine' push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") + lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/LockDir.py b/src/accl/graph/sega/LockDir.py new file mode 100644 index 0000000000..d21963dc3a --- /dev/null +++ b/src/accl/graph/sega/LockDir.py @@ -0,0 +1,46 @@ +# Copyright (c) 2012-2014, 2017-2018 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2007 The Regents of The University of Michigan +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class LockDirectory(SimObject): + type = 'LockDirectory' + cxx_header = 'accl/graph/sega/lock_dir.hh' + cxx_class = 'gem5::LockDirectory' diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index f20d0e44df..e6d2f1fbbc 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -28,9 +28,11 @@ Import('*') SimObject('ApplyEngine.py') +SimObject('LockDir.py') SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') +Source('lock_dir.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 2d650ecb92..b6e697266e 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -37,3 +37,4 @@ class WLEngine(BaseWLEngine): resp_port = ResponsePort("Port to Receive updates from outside") apply_engine = Param.ApplyEngine(Parent.any, "MPU object that owns this WLEngine") + lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc 
index 5d5f8daf26..544bb082ad 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -32,7 +32,8 @@ namespace gem5{ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : BaseApplyEngine(params), - pushEngine(params.push_engine) + pushEngine(params.push_engine), + lockDir(params.lock_dir) {} bool @@ -42,4 +43,16 @@ ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) } +bool +ApplyEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +ApplyEngine::releaseAddress(Addr addr) +{ + return lockDir->release(addr, requestorId); +} + } diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 1190786e36..c88330487a 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" #include "accl/graph/sega/push_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -48,10 +49,13 @@ class ApplyEngine : public BaseApplyEngine { private: PushEngine* pushEngine; + LockDirectory* lockDir; protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; public: PARAMS(ApplyEngine); diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/lock_dir.cc new file mode 100644 index 0000000000..b7efa638fe --- /dev/null +++ b/src/accl/graph/sega/lock_dir.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/lock_dir.hh" + +namespace gem5 +{ + +LockDirectory::LockDirectory(const LockDirectoryParams ¶ms) : + SimObject(params) +{} + +bool +LockDirectory::acquire(Addr addr, RequestorID requestorId) +{ + if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { + lockOwnerMap[addr] = requestorId; + lockDegreeMap[addr] = 1; + return true; + } else if (lockOwnerMap[addr] == requestorId) { + lockDegreeMap[addr] = lockDegreeMap[addr] + 1; + return true; + } else { + return false; + } +} + +bool +LockDirectory::release(Addr addr, RequestorID requestorId) +{ + if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { + panic("Should not relase an address before acquiring"); + } else if (lockOwnerMap[addr] != requestorId) { + panic("Should not release and address you don't own"); + } else { + lockDegreeMap[addr] = lockDegreeMap[addr] - 1; + if (lockDegreeMap[addr] == 0) { + lockDegreeMap.erase(addr); + lockOwnerMap.erase(addr); + return true; + } + } + return false; +} + +} diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/lock_dir.hh new file mode 100644 index 0000000000..64d934d42f --- /dev/null +++ b/src/accl/graph/sega/lock_dir.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ +#define __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ + +#include + +#include "mem/packet.hh" +#include "params/LockDirectory.hh" +#include "sim/sim_object.hh" + +namespace gem5 +{ + +class LockDirectory: public SimObject +{ + private: + std::unordered_map lockOwnerMap; + std::unordered_map lockDegreeMap; + + public: + PARAMS(LockDirectory); + LockDirectory(const LockDirectoryParams ¶ms); + + bool acquire(Addr addr, RequestorID requestorId); + bool release(Addr addr, RequestorID requestorId); +}; + +} + +#endif diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 674004d7a5..e557a08c18 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -34,7 +34,8 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), respPort(name() + ".respPort", this), - applyEngine(params.apply_engine) + applyEngine(params.apply_engine), + lockDir(params.lock_dir) {} Port& @@ -139,4 +140,16 @@ WLEngine::recvFunctional(PacketPtr pkt) // } } +bool +WLEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +WLEngine::releaseAddress(Addr 
addr) +{ + return lockDir->release(addr, requestorId); +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f895a7ad32..4e8a25795a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/base_wl_engine.hh" #include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" #include "params/WLEngine.hh" namespace gem5 @@ -64,13 +65,15 @@ class WLEngine : public BaseWLEngine RespPort respPort; ApplyEngine* applyEngine; - + LockDirectory* lockDir; virtual void startup(); void recvFunctional(PacketPtr pkt); protected: virtual bool sendWLNotif(Addr addr) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; public: PARAMS(WLEngine); From eb63831b87d00aed4447daaa7855fd5641e6de3f Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 28 Feb 2022 15:42:20 -0800 Subject: [PATCH 065/247] Debugging --- src/accl/graph/base/base_wl_engine.cc | 6 +++--- src/accl/graph/sega/wl_engine.cc | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 7f1a27aae5..f5d739da2d 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -83,8 +83,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); - DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s %d\n" + , __func__, wl.to_string(), value); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -110,7 +110,7 @@ BaseWLEngine::processNextWLReduceEvent() if (!releaseAddress(resp->getAddr())) { panic("Could not release an address"); } - std::cout << "success" << std::endl; + std::cout << "success 
"<< memRespQueue.size() << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e557a08c18..a84ed2d52f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -54,11 +54,11 @@ WLEngine::startup() //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 + {1000, 1000, 3, 0}, // Addr: 0 + {1000, 1000, 1, 3}, // Addr: 16 + {1000, 1000, 1, 4}, // Addr: 32 + {10000, 1000, 0, 5}, // Addr: 48 + {10000, 10000, 0, 5} // Addr: 64 }; Edge edges [6] = { {0, 16}, // Addr: 1048576 From 4d137d8c5389fb4dd28d4ca6a7e49df1184b9d9b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 15:53:38 -0800 Subject: [PATCH 066/247] More debugging. 
--- configs/accl/sega.py | 3 ++- src/accl/graph/base/base_engine.cc | 3 +++ src/accl/graph/sega/lock_dir.cc | 12 ++---------- src/accl/graph/sega/lock_dir.hh | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index db0bf4678f..db5a36b987 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -39,7 +39,8 @@ def __init__(self): self.clk_domain.voltage_domain = VoltageDomain() self.mpu = MPU() - self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) + self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") + # self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.port) diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 6b40ba4137..f449e6ffdb 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -87,6 +87,9 @@ BaseEngine::MemPort::recvReqRetry() bool BaseEngine::handleMemResp(PacketPtr pkt) { + if (pkt->isResponse() && pkt->isWrite()) { + return true; + } memRespQueue.push(pkt); scheduleMainEvent(); return true; diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/lock_dir.cc index b7efa638fe..6a4496175d 100644 --- a/src/accl/graph/sega/lock_dir.cc +++ b/src/accl/graph/sega/lock_dir.cc @@ -40,10 +40,6 @@ LockDirectory::acquire(Addr addr, RequestorID requestorId) { if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { lockOwnerMap[addr] = requestorId; - lockDegreeMap[addr] = 1; - return true; - } else if (lockOwnerMap[addr] == requestorId) { - lockDegreeMap[addr] = lockDegreeMap[addr] + 1; return true; } else { return false; @@ -58,12 +54,8 @@ LockDirectory::release(Addr addr, RequestorID requestorId) } else if (lockOwnerMap[addr] != requestorId) { panic("Should not release and address you don't own"); } else { - lockDegreeMap[addr] = lockDegreeMap[addr] - 1; - if 
(lockDegreeMap[addr] == 0) { - lockDegreeMap.erase(addr); - lockOwnerMap.erase(addr); - return true; - } + lockOwnerMap.erase(addr); + return true; } return false; } diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/lock_dir.hh index 64d934d42f..012334ce43 100644 --- a/src/accl/graph/sega/lock_dir.hh +++ b/src/accl/graph/sega/lock_dir.hh @@ -42,7 +42,7 @@ class LockDirectory: public SimObject { private: std::unordered_map lockOwnerMap; - std::unordered_map lockDegreeMap; + // std::unordered_map lockDegreeMap; public: PARAMS(LockDirectory); From efcbae85fd36cae6477f1aa66b802f078ef87e2f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 16:34:09 -0800 Subject: [PATCH 067/247] Fixed the bugs. Simulation is an endless loop. --- configs/accl/sega.py | 2 +- src/accl/graph/base/base_apply_engine.cc | 7 +++---- src/accl/graph/base/base_engine.cc | 6 ++++-- src/accl/graph/base/base_push_engine.cc | 2 +- src/accl/graph/base/base_wl_engine.cc | 10 ++++------ 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index db5a36b987..163ea169d9 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -50,6 +50,6 @@ def __init__(self): m5.instantiate() -exit_event = m5.simulate() +exit_event = m5.simulate(1000000) print("Simulation finished!") exit() diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 890d5dd313..e222cb5a76 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,10 +61,9 @@ BaseApplyEngine::processNextApplyCheckEvent() // pops items off queue, maybe we should pop every n cycles // or change the clock domain for this simobject. 
Addr addr = applyReadQueue.front(); - if (acquireAddress(addr)) { - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); - + Addr req_addr = (addr / 64) * 64; + Addr req_offset = (addr % 64); + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffset[memPkt->req] = req_offset; if (!memPortBlocked()) { diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index f449e6ffdb..ad87bb3662 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -27,7 +27,7 @@ */ #include "accl/graph/base/base_engine.hh" - +#include "debug/MPU.hh" namespace gem5 { @@ -36,7 +36,9 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : system(params.system), memPort(name() + ".memPort", this), requestorId(system->getRequestorId(this)) -{} +{ + DPRINTF(MPU, "%s: My requestorId is %u,\n", __func__, requestorId); +} BaseEngine::~BaseEngine() {} diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index f46941b8ed..4ebe40e486 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -121,7 +121,7 @@ BasePushEngine::processNextPushEvent() PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, requestorId); - if (sendPushUpdate(update)) { + if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); DPRINTF(MPU, "%s: Reading %s, updating with %d\n" , __func__, e.to_string(), *update_data); diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index f5d739da2d..921e9c683d 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -56,10 +56,9 @@ void BaseWLEngine::processNextWLReadEvent() uint32_t value = *(pkt->getPtr()); Addr addr = pkt->getAddr(); - if (acquireAddress(addr)) { - Addr req_addr = (addr / 64) * 64; - Addr req_offset = addr % 64; 
- + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffsetMap[memPkt->req] = req_offset; requestValueMap[memPkt->req] = value; @@ -98,7 +97,7 @@ BaseWLEngine::processNextWLReduceEvent() if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); - DPRINTF(MPU, "%s: The WLE is chanching to: %s\n" + DPRINTF(MPU, "%s: The WLE is changing to: %s\n" , __func__, wl.to_string()); // TODO: Erase map entries, delete wlData; } @@ -110,7 +109,6 @@ BaseWLEngine::processNextWLReduceEvent() if (!releaseAddress(resp->getAddr())) { panic("Could not release an address"); } - std::cout << "success "<< memRespQueue.size() << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } From f0dadbb9eea953ca1b69cca3e7bbc3dd994d87e3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 28 Feb 2022 18:34:18 -0800 Subject: [PATCH 068/247] Debugged: Releases the address when the memory is blocked. Added debugging flgs for validation. 
--- src/accl/graph/base/base_apply_engine.cc | 14 ++++++--- src/accl/graph/base/base_wl_engine.cc | 12 ++++++-- src/accl/graph/sega/wl_engine.cc | 17 ++++++----- src/mem/packet.cc | 39 ++++++++++++++++++++++++ src/mem/packet.hh | 2 ++ 5 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index e222cb5a76..39f5dafc67 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -86,8 +86,8 @@ BaseApplyEngine::processNextApplyEvent() Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); - DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item[%lu]: %s\n" + , __func__, pkt->getAddr() + request_offset, wl.to_string()); // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem // to applyengine if temp_prop < prop. If temp_prop has not changed, why // fwd it to applyengine? @@ -102,13 +102,17 @@ BaseApplyEngine::processNextApplyEvent() PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); + DPRINTF(MPU, "%s: Sending a pkt with this info. 
" + "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", + __func__, writePkt->getAddr(), + writePkt->getSize(), writePkt->printData()); + if (!memPortBlocked()) { if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); memRespQueue.pop(); - DPRINTF(MPU, "%s: The Apply Engine is applying the new value", - "into WorkList Item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: The Apply Engine is applying the new value into WorkList Item[%lu]: %s\n" + , __func__, pkt->getAddr() + request_offset, wl.to_string()); } } } else { diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 921e9c683d..fd45b85077 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -58,6 +58,7 @@ void BaseWLEngine::processNextWLReadEvent() Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; Addr req_offset = addr % 64; + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffsetMap[memPkt->req] = req_offset; @@ -67,6 +68,9 @@ void BaseWLEngine::processNextWLReadEvent() sendMemReq(memPkt); updateQueue.pop(); } + else{ + releaseAddress(req_addr); + } } if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); @@ -82,8 +86,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); - DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s %d\n" - , __func__, wl.to_string(), value); + DPRINTF(MPU, "%s: The WLE is reading WorkList item [%lu]: %s %d\n" + , __func__, resp->getAddr() + request_offset, wl.to_string(), value); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -93,6 +97,10 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); + DPRINTF(MPU, "%s: Sending a pkt with this info. 
" + "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", + __func__, writePkt->getAddr(), + writePkt->getSize(), writePkt->printData()); if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a84ed2d52f..03f74f1019 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -54,18 +54,19 @@ WLEngine::startup() //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { - {1000, 1000, 3, 0}, // Addr: 0 - {1000, 1000, 1, 3}, // Addr: 16 - {1000, 1000, 1, 4}, // Addr: 32 - {10000, 1000, 0, 5}, // Addr: 48 - {10000, 10000, 0, 5} // Addr: 64 + {10000, 10000, 3, 0}, // Addr: 0 + {10000, 10000, 1, 3}, // Addr: 16 + {10000, 10000, 1, 4}, // Addr: 32 + {10000, 10000, 1, 5}, // Addr: 48 + {10000, 10000, 0, 6} // Addr: 64 }; - Edge edges [6] = { + Edge edges [7] = { {0, 16}, // Addr: 1048576 {0, 32}, // Addr: 1048592 {0, 48}, // Addr: 1048608 {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 + {0, 64}, // Addr: 1048640 + {0, 32} }; for (int i = 0; i < 5; i++) { @@ -75,7 +76,7 @@ WLEngine::startup() sendMemFunctional(pkt); } - for (int i = 0; i < 6; i++) { + for (int i = 0; i < 7; i++) { uint8_t* data = edgeToMemory(edges[i]); PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), 16, data, 0); diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..da45246e49 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -532,4 +532,43 @@ Packet::getHtmTransactionUid() const return htmTransactionUid; } +std::string +Packet::printData() +{ + char ret[1024]; + if (isWrite()) { + uint8_t* data = getPtr(); + std::sprintf(ret,"\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + 
"V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n", + getAddr(), + *((uint32_t*) data), + *((uint32_t*) (data + 4)), + *((uint32_t*) (data + 8)), + *((uint32_t*) (data + 12)), + getAddr() + 16, + *((uint32_t*) (data + 16)), + *((uint32_t*) (data + 20)), + *((uint32_t*) (data + 24)), + *((uint32_t*) (data + 28)), + getAddr() + 32, + *((uint32_t*) (data + 32)), + *((uint32_t*) (data + 36)), + *((uint32_t*) (data + 40)), + *((uint32_t*) (data + 44)), + getAddr() + 48, + *((uint32_t*) (data + 48)), + *((uint32_t*) (data + 52)), + *((uint32_t*) (data + 56)), + *((uint32_t*) (data + 60))); + } + return ret; +} + } // namespace gem5 diff --git a/src/mem/packet.hh b/src/mem/packet.hh index a67abbbbaa..8803eacced 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1374,6 +1374,8 @@ class Packet : public Printable template void setRaw(T v); + std::string printData(); + public: /** * Check a functional request against a memory value stored in From b1a59999867d57af5d5083da4f3044ee785f6ad7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Mar 2022 01:24:54 -0700 Subject: [PATCH 069/247] Adding coalescer to the code. 
--- src/accl/graph/base/BaseReadEngine.py | 39 ++++ src/accl/graph/base/BaseReduceEngine.py | 38 ++++ src/accl/graph/base/base_read_engine.cc | 86 ++++++++ src/accl/graph/base/base_read_engine.hh | 101 ++++++++++ src/accl/graph/base/base_reduce_engine.cc | 51 +++++ src/accl/graph/base/base_reduce_engine.hh | 67 +++++++ .../graph/base/{ => old}/BaseApplyEngine.py | 0 src/accl/graph/base/{ => old}/BaseEngine.py | 0 .../graph/base/{ => old}/BasePushEngine.py | 0 src/accl/graph/base/{ => old}/BaseWLEngine.py | 0 .../graph/base/{ => old}/base_apply_engine.cc | 0 .../graph/base/{ => old}/base_apply_engine.hh | 0 src/accl/graph/base/{ => old}/base_engine.cc | 0 src/accl/graph/base/{ => old}/base_engine.hh | 0 .../graph/base/{ => old}/base_push_engine.cc | 0 .../graph/base/{ => old}/base_push_engine.hh | 0 .../graph/base/{ => old}/base_wl_engine.cc | 0 .../graph/base/{ => old}/base_wl_engine.hh | 0 src/accl/graph/sega/coalesce_engine.cc | 187 ++++++++++++++++++ src/accl/graph/sega/coalesce_engine.hh | 88 +++++++++ src/accl/graph/sega/{ => old}/ApplyEngine.py | 0 src/accl/graph/sega/{ => old}/LockDir.py | 0 src/accl/graph/sega/{ => old}/PushEngine.py | 0 src/accl/graph/sega/{ => old}/WLEngine.py | 0 src/accl/graph/sega/{ => old}/apply_engine.cc | 0 src/accl/graph/sega/{ => old}/apply_engine.hh | 0 src/accl/graph/sega/{ => old}/lock_dir.cc | 0 src/accl/graph/sega/{ => old}/lock_dir.hh | 0 src/accl/graph/sega/old/push_engine.cc | 90 +++++++++ src/accl/graph/sega/old/push_engine.hh | 77 ++++++++ src/accl/graph/sega/old/wl_engine.cc | 156 +++++++++++++++ src/accl/graph/sega/old/wl_engine.hh | 86 ++++++++ src/accl/graph/sega/push_engine.cc | 144 +++++++++++++- src/accl/graph/sega/push_engine.hh | 32 ++- src/accl/graph/sega/wl_engine.cc | 109 +++++++--- src/accl/graph/sega/wl_engine.hh | 37 ++-- 36 files changed, 1338 insertions(+), 50 deletions(-) create mode 100644 src/accl/graph/base/BaseReadEngine.py create mode 100644 src/accl/graph/base/BaseReduceEngine.py create mode 
100644 src/accl/graph/base/base_read_engine.cc create mode 100644 src/accl/graph/base/base_read_engine.hh create mode 100644 src/accl/graph/base/base_reduce_engine.cc create mode 100644 src/accl/graph/base/base_reduce_engine.hh rename src/accl/graph/base/{ => old}/BaseApplyEngine.py (100%) rename src/accl/graph/base/{ => old}/BaseEngine.py (100%) rename src/accl/graph/base/{ => old}/BasePushEngine.py (100%) rename src/accl/graph/base/{ => old}/BaseWLEngine.py (100%) rename src/accl/graph/base/{ => old}/base_apply_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_apply_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_push_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_push_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_wl_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_wl_engine.hh (100%) create mode 100644 src/accl/graph/sega/coalesce_engine.cc create mode 100644 src/accl/graph/sega/coalesce_engine.hh rename src/accl/graph/sega/{ => old}/ApplyEngine.py (100%) rename src/accl/graph/sega/{ => old}/LockDir.py (100%) rename src/accl/graph/sega/{ => old}/PushEngine.py (100%) rename src/accl/graph/sega/{ => old}/WLEngine.py (100%) rename src/accl/graph/sega/{ => old}/apply_engine.cc (100%) rename src/accl/graph/sega/{ => old}/apply_engine.hh (100%) rename src/accl/graph/sega/{ => old}/lock_dir.cc (100%) rename src/accl/graph/sega/{ => old}/lock_dir.hh (100%) create mode 100644 src/accl/graph/sega/old/push_engine.cc create mode 100644 src/accl/graph/sega/old/push_engine.hh create mode 100644 src/accl/graph/sega/old/wl_engine.cc create mode 100644 src/accl/graph/sega/old/wl_engine.hh diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py new file mode 100644 index 0000000000..84c53465b9 --- /dev/null +++ b/src/accl/graph/base/BaseReadEngine.py @@ -0,0 +1,39 @@ +# -*- 
coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from m5.params import *
from m5.proxy import *
from m5.objects.ClockedObject import ClockedObject


class BaseReadEngine(ClockedObject):
    """Abstract base SimObject for engines that read data from memory."""

    abstract = True
    type = 'BaseReadEngine'
    cxx_header = "accl/graph/base/base_read_engine.hh"
    cxx_class = 'gem5::BaseReadEngine'

    # System this engine belongs to (resolved from the parent by default).
    system = Param.System(Parent.any, 'System this Engine is a part of')
    # Request port used to issue reads to the memory system.
    mem_port = RequestPort("Port to communicate with the memory")
from m5.params import *
from m5.proxy import *
from m5.objects.ClockedObject import ClockedObject


class BaseReduceEngine(ClockedObject):
    """Abstract base SimObject for engines that reduce worklist items."""

    abstract = True
    type = 'BaseReduceEngine'
    cxx_header = "accl/graph/base/base_reduce_engine.hh"
    cxx_class = 'gem5::BaseReduceEngine'

    # System this engine belongs to (resolved from the parent by default).
    system = Param.System(Parent.any, 'System this Engine is a part of')
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/base/base_read_engine.hh" + +namespace gem5 +{ + +BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): + ClockedObject(params), + system(params.system), + memPort(name() + ".mem_port", this), + _requestorId(system.getRequestorId(this)), +{} + +BaseReadEngine::~BaseReadEngine() +{} + +Port& +BaseReadEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +BaseReadEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + return owner->handleMemResp(pkt); +} + +void +BaseReadEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh new file mode 100644 index 0000000000..99f14bcb06 --- /dev/null +++ b/src/accl/graph/base/base_read_engine.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_READ_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_READ_ENGINE_HH__ + +#include +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReadEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseReadEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + MemPort memPort; + + bool handleMemResp(PacketPtr resp); + + protected: + const RequestorID _requestorId; + + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + virtual bool handleMemResp(PacketPtr pkt) = 0; + + public: + PARAMS(BaseReadEngine); + + BaseReadEngine(const BaseReadEngineParams ¶ms); + ~BaseReadEngine(); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + RequestorID requestorId() { return _requestorId; } + + 
AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } + + void recvFunctional(PacketPtr pkt); + +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..fbfc613313 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +void +BaseReduceEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + currentWorkListAddress = addr; + currentWorkList = wl; + scheduleReduceEvent(); +} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..e44f384f26 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ + + +#include "accl/base/util.hh" +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + bool handleIncomingWL(Addr addr, WorkListItem wl); + + protected: + Addr currentWorkListAddress; + WorkListItem currentWorkList; + + const RequestorID _requestorId; + + virtual void scheduleReduceEvent() = 0; + + public: + PARAMS(BaseReduceEngine); + + BaseReduceEngine(const BaseReduceEngineParams ¶ms); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/old/BaseApplyEngine.py similarity index 100% rename from src/accl/graph/base/BaseApplyEngine.py rename to src/accl/graph/base/old/BaseApplyEngine.py diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/old/BaseEngine.py similarity index 100% rename from src/accl/graph/base/BaseEngine.py rename to src/accl/graph/base/old/BaseEngine.py diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/old/BasePushEngine.py similarity index 100% rename from src/accl/graph/base/BasePushEngine.py rename to src/accl/graph/base/old/BasePushEngine.py diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/old/BaseWLEngine.py similarity index 100% rename from src/accl/graph/base/BaseWLEngine.py rename to src/accl/graph/base/old/BaseWLEngine.py diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/old/base_apply_engine.cc similarity index 100% rename from src/accl/graph/base/base_apply_engine.cc rename to src/accl/graph/base/old/base_apply_engine.cc diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/old/base_apply_engine.hh similarity index 100% 
rename from src/accl/graph/base/base_apply_engine.hh rename to src/accl/graph/base/old/base_apply_engine.hh diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/old/base_engine.cc similarity index 100% rename from src/accl/graph/base/base_engine.cc rename to src/accl/graph/base/old/base_engine.cc diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/old/base_engine.hh similarity index 100% rename from src/accl/graph/base/base_engine.hh rename to src/accl/graph/base/old/base_engine.hh diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/old/base_push_engine.cc similarity index 100% rename from src/accl/graph/base/base_push_engine.cc rename to src/accl/graph/base/old/base_push_engine.cc diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/old/base_push_engine.hh similarity index 100% rename from src/accl/graph/base/base_push_engine.hh rename to src/accl/graph/base/old/base_push_engine.hh diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/old/base_wl_engine.cc similarity index 100% rename from src/accl/graph/base/base_wl_engine.cc rename to src/accl/graph/base/old/base_wl_engine.cc diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/old/base_wl_engine.hh similarity index 100% rename from src/accl/graph/base/base_wl_engine.hh rename to src/accl/graph/base/old/base_wl_engine.hh diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc new file mode 100644 index 0000000000..1f7a94dc7e --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/sega/coalesce_engine.hh" + +#include "accl/sega/wl_engine.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): + BaseReadEngine(params), + reqQueueSize(params.req_queue_size), + conflictAddrQueueSize(params.conflict_addr_queue_size), + nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), + nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) +{} + +CoalesceEngine::~CoalesceEngine() +{} + +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + sendMemFunctional(pkt); +} + +void +CoalesceEngine::registerWLEngine(WLEngine* wl_engine) +{ + peerWLEngine = wl_engine; +} + +bool +CoalesceEngine::recvReadAddr(Addr addr) +{ + assert(reqQueue.size() <= reqQueueSize); + if (reqQueue.size() == reqQueueSize) { + return false; + } + + reqQueue.push(addr); + if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextRespondEvent() +{ + // TODO: Investigate this for optimization + Addr addr = reqQueue.front(); + Addr alligned_addr = (addr / 64) * 64; + int block_index = alligned_addr % 256; + int wl_offset = (addr - alligned_addr) / 16; + + if (cacheBlocks[block_index].allocated) { + // Hit + // TODO: I guess this piece of code code could be optimized. + // Not the code per se. The design it represents. 
+ if (cacheBlocks[block_index].addr == alligned_addr) { + if (!cacheBlocks[block_index].taken[wl_offset]) { + if (cacheBlocks[block_index].valid) { + peerWLEngine->handleIncomingWL(addr, + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].taken[wl_offset] = true; + } else { + cacheBlocks[block_index].pending[wl_offset] = true; + } + reqQueue.pop(); + } + } else { // conflict + assert(conflictAddrQueue.size() <= conflictAddrQueueSize); + if (conflictAddrQueue.size() < conflictAddrQueueSize) { + cacheBlocks[block_index].numConflicts += 1; + conflictAddrQueue.push(addr); + reqQueue.pop(); + } + } + } else { + // miss + cacheBlocks[block_index].addr = alligned_addr; + cacheBlocks[block_index].numConflicts = 0; + cacheBlocks[block_index].pending = {false, false, false, false}; + cacheBlocks[block_index].pending[wl_offset] = true; + cacheBlocks[block_index].taken = {false, false, false, false}; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].allocated = true; + + PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); + + if (!memPortBlocked()) { + sendMemReq(pkt); + reqQueue.pop(); + } + } + + if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); + } +} + +/* + void recvWLWrite(Addr addr, WorkListItem wl); +*/ + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + if (pkt->isResp() && pkt->isWrite()) { + return true; + } + + Addr addr = pkt->getAddr(); + uint8_t data = pkt->getPtr(); + + int block_index = addr % 256; + cacheBlocks[block_index].valid = true; + + for (i = 0; i < 4; i++) { + cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); + cacheBlocks[block_index].taken[i] = false; + if (cacheBlocks[block_index].pending[i]) { + peerWLEngine->handleIncomingWL(addr + (i * 16), + cacheBlocks[block_index].items[i]); + cacheBlocks[block_index].taken[i] = true; + } + cacheBlocks[block_index].pending = false; + } +} + +void 
+CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr alligned_addr = (addr / 64) * 64; + int block_index = alligned_addr % 256; + int wl_offset = (addr - alligned_addr) / 16; + + assert(cacheBlocks[block_index].taken[wl_offset]); + cacheBlocks[block_index].item[wl_offset] = wl; + cacheBlocks[block_index].taken[wl_offset] = false; + + bool taken_item = false; + taken_item &= (cacheBlocks[block_index].taken[0] & + cacheBlocks[block_index].taken[1] & + cacheBlocks[block_index].taken[2] & + cacheBlocks[block_index].taken[3]); + + if (!taken_item) { + for (auto conflictAddr : conflictAddrQueue) { + int conflict_block_index = ((conflictAddr / 64) * 64) % 256; + if (conflict_block_index == block_index) { + // Evict cacheBlocks[block_index] + // Respond to conflictAddr + } + } + } + +} + +} diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh new file mode 100644 index 0000000000..0b349b2c1a --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include "accl/base/base_read_engine.hh" + +namespace gem5 +{ + +class WLEngine; + +class CoalesceEngine : public BaseReadEngine +{ + private: + struct Block + { + WorkListItem items[4]; + Addr addr; + int numConflicts; + bool pending[4]; + bool taken[4]; + bool valid; + bool allocated; + }; + + WLEngine* peerWLEngine; + + Block cacheBlocks[256]; + + int reqQueueSize; + std::queue reqQueue; + + int conflictAddrQueueSize; + std::queue conflictAddrQueue; + + EventFunctionWrapper nextRespondEvent; + void processNextRespondEvent(); + + EventFunctionWrapper nextApplyAndCommitEvent; + void processNextApplyAndCommitEvent(); + + protected: + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(CoalesceEngine); + + CoalesceEngine(const CoalesceEngineParams ¶ms); + ~CoalesceEngine(); + + void recvFunctional(PacketPtr pkt); + + bool recvReadAddr(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + void registerWLEngine(WLEngine* wl_engine); +} + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/ApplyEngine.py 
b/src/accl/graph/sega/old/ApplyEngine.py similarity index 100% rename from src/accl/graph/sega/ApplyEngine.py rename to src/accl/graph/sega/old/ApplyEngine.py diff --git a/src/accl/graph/sega/LockDir.py b/src/accl/graph/sega/old/LockDir.py similarity index 100% rename from src/accl/graph/sega/LockDir.py rename to src/accl/graph/sega/old/LockDir.py diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/old/PushEngine.py similarity index 100% rename from src/accl/graph/sega/PushEngine.py rename to src/accl/graph/sega/old/PushEngine.py diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/old/WLEngine.py similarity index 100% rename from src/accl/graph/sega/WLEngine.py rename to src/accl/graph/sega/old/WLEngine.py diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/old/apply_engine.cc similarity index 100% rename from src/accl/graph/sega/apply_engine.cc rename to src/accl/graph/sega/old/apply_engine.cc diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/old/apply_engine.hh similarity index 100% rename from src/accl/graph/sega/apply_engine.hh rename to src/accl/graph/sega/old/apply_engine.hh diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/old/lock_dir.cc similarity index 100% rename from src/accl/graph/sega/lock_dir.cc rename to src/accl/graph/sega/old/lock_dir.cc diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/old/lock_dir.hh similarity index 100% rename from src/accl/graph/sega/lock_dir.hh rename to src/accl/graph/sega/old/lock_dir.hh diff --git a/src/accl/graph/sega/old/push_engine.cc b/src/accl/graph/sega/old/push_engine.cc new file mode 100644 index 0000000000..c7b229ad33 --- /dev/null +++ b/src/accl/graph/sega/old/push_engine.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const PushEngineParams ¶ms) : + BasePushEngine(params), + reqPort(name() + "reqPort", this) +{} + +Port& +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return BasePushEngine::getPort(if_name, idx); + } +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; +} + +} diff --git a/src/accl/graph/sega/old/push_engine.hh b/src/accl/graph/sega/old/push_engine.hh new file mode 100644 index 0000000000..604df4750d --- /dev/null +++ b/src/accl/graph/sega/old/push_engine.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include "accl/graph/base/base_push_engine.hh" +#include "params/PushEngine.hh" + +namespace gem5 +{ + +class MPU; + +class PushEngine : public BasePushEngine +{ + private: + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, PushEngine* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; + + protected: + virtual bool sendPushUpdate(PacketPtr pkt) override; + + public: + PARAMS(PushEngine); + PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/wl_engine.cc b/src/accl/graph/sega/old/wl_engine.cc new file mode 100644 index 0000000000..03f74f1019 --- /dev/null +++ b/src/accl/graph/sega/old/wl_engine.cc @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" +#include "debug/MPU.hh" +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams ¶ms): + BaseWLEngine(params), + respPort(name() + ".respPort", this), + applyEngine(params.apply_engine), + lockDir(params.lock_dir) +{} + +Port& +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "resp_port") { + return respPort; + } else { + return BaseWLEngine::getPort(if_name, idx); + } +} + +void +WLEngine::startup() +{ + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {10000, 10000, 3, 0}, // Addr: 0 + {10000, 10000, 1, 3}, // Addr: 16 + {10000, 10000, 1, 4}, // Addr: 32 + {10000, 10000, 1, 5}, // Addr: 48 + {10000, 10000, 0, 6} // Addr: 64 + }; + Edge edges [7] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64}, // Addr: 1048640 + {0, 32} + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + sendMemFunctional(pkt); + } + + for (int i = 0; i < 7; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + sendMemFunctional(pkt); + } + + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = getUpdatePacket( + 0, 4, first_update_data, requestorId); + + handleWLUpdate(first_update); +} + +bool +WLEngine::sendWLNotif(Addr addr){ + return applyEngine->recvWLNotif(addr); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleWLUpdate(pkt); +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void 
+WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + // FIXME: This needs to be fixed + // if (pkt->cmd == MemCmd::UpdateWL) { + // panic("Functional requests should not be made to WL."); + // //TODO: Might be a good idea to implement later. + // // wlEngine->recvFunctional(pkt); + // } else { + sendMemFunctional(pkt); + // } +} + +bool +WLEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +WLEngine::releaseAddress(Addr addr) +{ + return lockDir->release(addr, requestorId); +} + +} diff --git a/src/accl/graph/sega/old/wl_engine.hh b/src/accl/graph/sega/old/wl_engine.hh new file mode 100644 index 0000000000..4e8a25795a --- /dev/null +++ b/src/accl/graph/sega/old/wl_engine.hh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_wl_engine.hh" +#include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" +#include "params/WLEngine.hh" + +namespace gem5 +{ + +class ApplyEngine; + +class WLEngine : public BaseWLEngine +{ + private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + + public: + RespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + RespPort respPort; + ApplyEngine* applyEngine; + LockDirectory* lockDir; + + virtual void startup(); + void recvFunctional(PacketPtr pkt); + + protected: + virtual bool sendWLNotif(Addr addr) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; + + public: + PARAMS(WLEngine); + WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c7b229ad33..c865451999 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -31,9 +31,16 @@ namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms) : - BasePushEngine(params), - reqPort(name() + "reqPort", this) +PushEngine::PushEngine(const PushEngineParams ¶ms): + BaseReadEngine(params), + reqPort(name() + ".req_port", this), + baseEdgeAddr(params.base_edge_addr), + memRespQueueSize(params.mem_resp_queue_size), + pushReqQueueSize(params.push_req_queue_size), + onTheFlyReadReqs(0), + nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextPushEvent([this] { processNextPushEvent(); }, name()) {} Port& @@ -41,8 +48,10 @@ PushEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "req_port") { return reqPort; + } else if (if_name == "mem_port") { + return BaseReadEngine::getPort(if_name, idx); } else { - return BasePushEngine::getPort(if_name, idx); + return SimObject::getPort(if_name, idx); } } @@ -78,13 +87,130 @@ PushEngine::ReqPort::recvReqRetry() } bool -PushEngine::sendPushUpdate(PacketPtr pkt) +PushEngine::recvWLItem(WorkListItem wl); { - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; + assert(pushReqQueue.size() <= pushReqQueueSize); + if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { + return false; + } + pushReqQueue.push(wl); + + if ((!nextAddrGenEvent.scheduled()) && + (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); + } + return true; +} + +void +PushEngine::processNextAddrGenEvent() +{ + WorkListItem wl = pushReqQueue.front(); + + std::vector addr_queue; + std::vector offset_queue; + std::vector num_edge_queue; + + for (uint32_t index = 0; index < wl.degree; index++) { + Addr edge_addr = baseEdgeAddr + (wl.edgeIndex + index) * sizeof(Edge); + Addr req_addr = (edge_addr / 64) * 64; + Addr req_offset = edge_addr % 64; + if (addr_queue.size()) { + if (addr_queue.back() == 
req_addr) { + num_edge_queue.back()++; + } + else { + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); + } + } + else { + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); + } + }; + + for (int index = 0; index < addr_queue.size(); index++) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = wl.prop; + pendingReadReqs.push(pkt); + } + + pushReadReqs.pop(); + + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); + } + + if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +PushEngine::processNextReadEvent() +{ + if (((memRespQueue.size() + onTheFlyReadReqs) <= memRespQueueSize) && + (!memPortBlocked())) { + PacketPtr pkt = pendingReadReqs.front(); + sendMemReq(pkt); + onTheFlyReadReqs++; + pendingReadReqs.pop(); + } + + if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { + schedule(nextReadEvent, nextCycle()); + } +} + +bool +PushEngine::handleMemResp(PacketPtr pkt) +{ + onTheFlyReadReqs--; + memRespQueue.push(pkt); + + if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { + schedule(nextPushEvent, nextCycle()); + } +} + +void +PushEngine::processNextPushEvent() +{ + PacketPtr pkt = memRespQueue.front(); + RequestPtr req = pkt->req; + uint8_t *data = pkt->getPtr(); + + Addr offset = reqOffsetMap[req]; + int num_edges = reqNumEdgeMap[req]; + uint32_t value = reqValueMap[req]; + + int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); + for (int i = 0; i < num_edges; i++) { + uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); + Edge e = memoryToEdge(curr_edge_data); + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t 
[data_size]); + // TODO: Implement propagate function here + *update_data = value + 1; + PacketPtr update = getUpdatePacket(e.neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, + requestorId); + if (sendPushUpdate(update) && (i == num_edges - 1)) { + memRespQueue.pop(); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); + // TODO: Erase map entries here. + } + } + + if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextPushEvent, nextCycle()); } - return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 604df4750d..bf645eb119 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,15 +29,13 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_push_engine.hh" +#include "accl/graph/base/base_read_engine.hh" #include "params/PushEngine.hh" namespace gem5 { -class MPU; - -class PushEngine : public BasePushEngine +class PushEngine : public BaseReadEngine { private: class ReqPort : public RequestPort @@ -62,14 +60,38 @@ class PushEngine : public BasePushEngine ReqPort reqPort; + Addr baseEdgeAddr; + + int pushReqQueueSize; + std::queue pushReqQueue; + + // TODO: Possibility of infinite queueing + std::queue pendingReadReqs; + + int memRespQueueSize; + int onTheFlyReadReqs; + std::queue memRespQueue; + + EventFunctionWrapper nextAddrGenEvent; + void processNextAddrGenEvent(); + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextPushEvent; + void processNextPushEvent(); + protected: - virtual bool sendPushUpdate(PacketPtr pkt) override; + virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool recvWLItem(WorkListItem wl); }; } diff --git 
a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 03f74f1019..f0c522ff6f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,15 +28,22 @@ #include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" + namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params), - respPort(name() + ".respPort", this), - applyEngine(params.apply_engine), - lockDir(params.lock_dir) -{} + BaseReduceEngine(params), + respPort(name() + ".resp_port", this), + blockedByCoalescer(false), + coaleseEngine(params.coalesce_engine), + updateQueueSize(params.update_queue_size), + onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextReduceEvent([this]{ processNextReduceEvent(); }, name()) +{ + coaleseEngine->registerWLEngine(this); +} Port& WLEngine::getPort(const std::string &if_name, PortID idx) @@ -44,7 +51,7 @@ WLEngine::getPort(const std::string &if_name, PortID idx) if (if_name == "resp_port") { return respPort; } else { - return BaseWLEngine::getPort(if_name, idx); + return BaseReduceEngine::getPort(if_name, idx); } } @@ -53,6 +60,8 @@ WLEngine::startup() { //FIXME: This is the current version of our initializer. // This should be updated in the future. + //FIXME: The WLEngine no longer has a MemPort. Update this to + // work with the CoalesceEngine instead. 
WorkListItem vertices [5] = { {10000, 10000, 3, 0}, // Addr: 0 {10000, 10000, 1, 3}, // Addr: 16 @@ -93,11 +102,6 @@ WLEngine::startup() handleWLUpdate(first_update); } -bool -WLEngine::sendWLNotif(Addr addr){ - return applyEngine->recvWLNotif(addr); -} - AddrRangeList WLEngine::RespPort::getAddrRanges() const { @@ -107,7 +111,7 @@ WLEngine::RespPort::getAddrRanges() const bool WLEngine::RespPort::recvTimingReq(PacketPtr pkt) { - return owner->handleWLUpdate(pkt); + return owner->handleIncomingUpdate(pkt); } Tick @@ -131,26 +135,81 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - // FIXME: This needs to be fixed - // if (pkt->cmd == MemCmd::UpdateWL) { - // panic("Functional requests should not be made to WL."); - // //TODO: Might be a good idea to implement later. - // // wlEngine->recvFunctional(pkt); - // } else { - sendMemFunctional(pkt); - // } + coaleseEngine->recvFunctional(pkt); } -bool -WLEngine::acquireAddress(Addr addr) +AddrRangeList +WLEngine::getAddrRanges() { - return lockDir->acquire(addr, requestorId); + return coaleseEngine->getAddrRanges(); +} + +void +WLEngine::processNextReadEvent() +{ + PacketPtr update = updateQueue.front(); + Addr update_addr = update->getAddr(); + uint32_t update_value = update->getPtr(); + + if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && + (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { + if (coalesceEngine->recvReadAddr(update_addr)) { + onTheFlyUpdateMap[update_addr] = update_value + updateQueue.pop(); + } + } else { + // TODO: Generalize this to reduce function rather than just min + onTheFlyUpdateMap[update_addr] = + min(update_addr, onTheFlyUpdateMap[update_addr]); + updateQueue.pop(); + // TODO: Add a stat to count the number of coalescions + } + + if ((!nextReadEvent.scheduled()) && + ((!updateQueue.empty()) || + (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize))) { + schedule(nextReadEvent, nextCycle()); + } +} + +void 
+WLEngine::processNextReduceEvent() +{ + // TODO: Generalize this to reduce function rather than just min + currentWorkList.temp_prop = min(onTheFlyUpdateMap[currentWorkListAddress], + currentWorkList.temp_prop); + // TODO: Add a delay here + coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); + + onTheFlyUpdateMap.erase(currentWorkListAddress); + currentWorkListAddress = 0; + currentWorkList = {0, 0, 0, 0}; +} + +void +WLEngine::scheduleReduceEvent() +{ + // TODO: Add checks to see if scheduling is necessary or correct. + if (!nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } } bool -WLEngine::releaseAddress(Addr addr) +WLEngine::handleIncomingUpdate(PacketPtr pkt) { - return lockDir->release(addr, requestorId); + // TODO: Coalesce updates here too + assert(updateQueue.size() <= updateQueueSize); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } + + updateQueue.push(pkt); + if ((!nextReadEvent.scheduled()) && + (!updateQueue.empty())) { + schedule(nextReadEvent, nextCycle()); + } + return true; } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 4e8a25795a..1846825951 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -32,17 +32,14 @@ #include #include -#include "accl/graph/base/base_wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" +#include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" #include "params/WLEngine.hh" namespace gem5 { -class ApplyEngine; - -class WLEngine : public BaseWLEngine +class WLEngine : public BaseReduceEngine { private: class RespPort : public ResponsePort @@ -64,22 +61,40 @@ class WLEngine : public BaseWLEngine }; RespPort respPort; - ApplyEngine* applyEngine; - LockDirectory* lockDir; + + bool blockedByCoalescer; + CoalesceEngine* coaleseEngine; + + int updateQueueSize; + std::queue 
updateQueue; + + int onTheFlyUpdateMapSize; + std::unordered_map onTheFlyUpdateMap; virtual void startup(); + void recvFunctional(PacketPtr pkt); + AddrRangeList getAddrRanges() const; + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextReduceEvent; + void processNextReduceEvent(); + protected: - virtual bool sendWLNotif(Addr addr) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; + virtual void scheduleReduceEvent() = 0; public: PARAMS(WLEngine); + WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool handleIncomingUpdate(PacketPtr pkt); }; } From 4cc59dc9487d376ee1185cabad60a7ead7b1b564 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 22 Mar 2022 16:01:55 -0700 Subject: [PATCH 070/247] Finalizing source code. Before compile. --- src/accl/graph/base/SConscript | 12 +- src/accl/graph/sega/CoalesceEngine.py | 40 ++++ src/accl/graph/sega/PushEngine.py | 40 ++++ src/accl/graph/sega/SConscript | 8 +- src/accl/graph/sega/WLEngine.py | 40 ++++ src/accl/graph/sega/coalesce_engine.cc | 306 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 30 ++- 7 files changed, 377 insertions(+), 99 deletions(-) create mode 100644 src/accl/graph/sega/CoalesceEngine.py create mode 100644 src/accl/graph/sega/PushEngine.py create mode 100644 src/accl/graph/sega/WLEngine.py diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index c5c8c4e901..c6a78eb5e8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,15 +27,11 @@ Import('*') -SimObject('BaseApplyEngine.py') -SimObject('BaseEngine.py') -SimObject('BasePushEngine.py') -SimObject('BaseWLEngine.py') +SimObject('BaseReadEngine.py') +SimObject('BaseReduceEngine.py') -Source('base_apply_engine.cc') -Source('base_engine.cc') -Source('base_push_engine.cc') -Source('base_wl_engine.cc') 
+Source('base_read_engine.cc') +Source('base_reduce_engine.cc') Source('util.cc') DebugFlag('MPU') diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py new file mode 100644 index 0000000000..0330da7576 --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReadEngine import BaseReadEngine + +class CoalesceEngine(BaseReadEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + peer_push_engine = Param.PushEngine(NULL, "") + num_mshr_entry = Param.Int(4, "") + num_tgts_per_mshr = Param.Int(20, "") + outstanding_mem_req_queue_size = Param.Int(20, "") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..9036b4e401 --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReadEngine import BaseReadEngine + +class PushEngine(BaseReadEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + req_port = RequestPort("Port to send updates to the outside") + base_edge_addr = Param.Addr() + mem_resp_queue_size = Param.Int(0, "") + push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index e6d2f1fbbc..9b4629838b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,12 +27,12 @@ Import('*') -SimObject('ApplyEngine.py') -SimObject('LockDir.py') +SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') -Source('apply_engine.cc') -Source('lock_dir.cc') +Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') + +DebugFlag('MPU') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..ec9154b138 --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReduceEngine import BaseReduceEngine + +class WLEngine(BaseReduceEngine): + type = 'WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + resp_port = ResponsePort("Port to Receive updates from outside") + coalesce_engine = Param.CoaleseEngine(NULL, "") + update_queue_size = Param.Int(0, "") + on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1f7a94dc7e..22bc0d49a6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,14 +29,17 @@ #include "accl/sega/coalesce_engine.hh" #include "accl/sega/wl_engine.hh" +#include "debug/MPU.hh" namespace gem5 { CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): BaseReadEngine(params), - reqQueueSize(params.req_queue_size), - conflictAddrQueueSize(params.conflict_addr_queue_size), + peerPushEngine(params.peer_push_engine), + numMSHREntry(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} @@ -59,69 +62,100 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) bool CoalesceEngine::recvReadAddr(Addr addr) { - assert(reqQueue.size() <= reqQueueSize); - if (reqQueue.size() == reqQueueSize) { - return false; - } - - reqQueue.push(addr); - if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); - } - return true; -} - -void -CoalesceEngine::processNextRespondEvent() -{ - // TODO: Investigate this for optimization - Addr addr = reqQueue.front(); + assert(MSHRMap.size() <= numMSHREntry); Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int 
wl_offset = (addr - alligned_addr) / 16; - if (cacheBlocks[block_index].allocated) { + if ((cacheBlocks[block_index].addr == alligned_addr) && + (cacheBlocks[block_index].valid)) { // Hit - // TODO: I guess this piece of code code could be optimized. - // Not the code per se. The design it represents. - if (cacheBlocks[block_index].addr == alligned_addr) { - if (!cacheBlocks[block_index].taken[wl_offset]) { - if (cacheBlocks[block_index].valid) { - peerWLEngine->handleIncomingWL(addr, - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].taken[wl_offset] = true; - } else { - cacheBlocks[block_index].pending[wl_offset] = true; - } - reqQueue.pop(); - } - } else { // conflict - assert(conflictAddrQueue.size() <= conflictAddrQueueSize); - if (conflictAddrQueue.size() < conflictAddrQueueSize) { - cacheBlocks[block_index].numConflicts += 1; - conflictAddrQueue.push(addr); - reqQueue.pop(); - } + addrResponseQueue.push(addr); + worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); } + return true; } else { // miss - cacheBlocks[block_index].addr = alligned_addr; - cacheBlocks[block_index].numConflicts = 0; - cacheBlocks[block_index].pending = {false, false, false, false}; - cacheBlocks[block_index].pending[wl_offset] = true; - cacheBlocks[block_index].taken = {false, false, false, false}; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].allocated = true; - - PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); - - if (!memPortBlocked()) { - sendMemReq(pkt); - reqQueue.pop(); + if (MSHRMap.find(block_index) == MSHRMap.end()) { + if (MSHRMap.size() == numMSHREntry) { + // Out of MSHR entries + return false; + } else { + if (cacheBlock[block_index].allocated) { + assert(MSHRMap[block_index].size() 
<= numTgtsPerMSHR) + if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + return false; + } + // MSHR available but conflict + cacheBlocks[block_index].hasConflict = true; + MSHRMap[block_index].push_back(addr); + return true; + } else { + // MSHR available and no conflict + assert( + outstandingMemReqQueue.size() <= + outstandingMemReqQueueSize); + if (outstandingMemReqQueue.size() == + outstandingMemReqQueueSize) { + return false; + } + cacheBlocks[block_index].addr = alligned_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + + MSHRMap[block_index].push_back(addr); + PacketPtr pkt = getReadPacket(alligned_addr, + 64, _requestorId); + outstandingMemReqQueue.push(pkt); + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } + return true; + } + } } + } +} + +void +CoalesceEngine::processNextMemReqEvent() +{ + PacketPtr pkt = outstandingMemReqQueue.front(); + + if (!memPortBlocked()) { + sendMemReq(pkt); + outstandingMemReqQueue.pop(); + } + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); } +} + +void +CoalesceEngine::processNextRespondEvent() +{ + Addr addr_response = addrResponseQueue.front(); + WorkListItem worklist_response = worklistResponseQueue.front(); + + peerWLEngine->handleIncomingWL(addr_response, worklist_response); - if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + addrResponseQueue.pop(); + worklistResponseQueue.pop(); + + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } @@ -139,19 +173,50 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); uint8_t data = pkt->getPtr(); - int block_index = addr % 256; + + 
assert((cacheBlocks[block_index].allocated) && // allocated cache block + (!cacheBlocks[block_index].valid) && // valid is false + (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR cacheBlocks[block_index].valid = true; - for (i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); - cacheBlocks[block_index].taken[i] = false; - if (cacheBlocks[block_index].pending[i]) { - peerWLEngine->handleIncomingWL(addr + (i * 16), - cacheBlocks[block_index].items[i]); - cacheBlocks[block_index].taken[i] = true; + } + + int bias = 0; + std::vector servicedIndices; + for (int i = 0; i < MSHRMap[block_index].size(); i++) { + Addr miss_addr = MSHRMap[block_index][i]; + Addr alligned_miss_addr = (miss_addr / 64) * 64; + + if (alligned_miss_addr == addr) { + int wl_offset = (miss_addr - alligned_miss_addr) / 16; + addrResponseQueue.push(miss_addr); + worklistResponseQueue.push( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); + servicedIndices.push_back(i); } - cacheBlocks[block_index].pending = false; + } + // TODO: We Can use taken instead of this + for (int i = 0; i < servicedIndices.size(); i++) { + MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + servicedIndices[i] - bias); + bias++; + } + + if (MSHRMap[block_index].empty()) { + MSHRMap.erase(block_index); + cacheBlocks[block_index].hasConflict = false; + } else { + cacheBlocks[block_index].hasConflict = true; + } + + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); } } @@ -162,26 +227,111 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; - assert(cacheBlocks[block_index].taken[wl_offset]); + assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + (1 << wl_offset)); 
cacheBlocks[block_index].item[wl_offset] = wl; - cacheBlocks[block_index].taken[wl_offset] = false; - - bool taken_item = false; - taken_item &= (cacheBlocks[block_index].taken[0] & - cacheBlocks[block_index].taken[1] & - cacheBlocks[block_index].taken[2] & - cacheBlocks[block_index].taken[3]); - - if (!taken_item) { - for (auto conflictAddr : conflictAddrQueue) { - int conflict_block_index = ((conflictAddr / 64) * 64) % 256; - if (conflict_block_index == block_index) { - // Evict cacheBlocks[block_index] - // Respond to conflictAddr - } + cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + + // TODO: Make this more general and programmable. + // && (cacheBlocks[block_index].hasConflict) + if ((cacheBlocks[block_index].takenMask == 0)) { + evictQueue.push(block_index); + } + + if ((!nextApplyAndCommitEvent.scheduled()) && + (!evictQueue.empty())) { + schedule(nextApplyAndCommitEvent, nextCycle()); + } + +} + +void +CoalesceEngine::processNextApplyAndCommitEvent() +{ + int block_index = evictQueue.front(); + uint8_t changedMask = 0; + uint8_t data[64]; + + for (int i = 0; i < 4; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].temp_prop); + if (old_prop != cacheBlocks[block_index].items[i].prop) { + changedMask |= (1 << i); } + uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); + std::memcpy(data[i * 16], wl_data, sizeof(WorkListItem)); } + if (changed) { + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + PacketPtr write_pkt = getWritePacket( + cacheBlocks[block_index].addr, 64, data, _requestorId); + + if ((cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ + Addr miss_addr = MSHRMap[block_index][0]; + // TODO: Make sure this trick works; + Addr alligned_miss_addr = (miss_addr / 64) * 64; + PacketPtr read_pkt 
= getReadPacket( + alligned_miss_addr, 64, _requestorId); + outstandingMemReqQueue.push(write_pkt); + outstandingMemReqQueue.push(read_pkt); + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + } + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + evictQueue.pop(); + } else if ((!cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + outstandingMemReqQueue.push(write_pkt); + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + } + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + evictQueue.pop(); + } else { + DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , + __func__); + } + } + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } + + if ((!nextApplyAndCommitEvent.scheduled()) && + (!evictQueue.empty())) { + schedule(nextApplyAndCommitEvent, nextCycle()); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh 
b/src/accl/graph/sega/coalesce_engine.hh index 0b349b2c1a..f5fd85e4cf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #include "accl/base/base_read_engine.hh" +#include "accl/sega/push_engine.hh" namespace gem5 { @@ -43,22 +44,33 @@ class CoalesceEngine : public BaseReadEngine { WorkListItem items[4]; Addr addr; - int numConflicts; - bool pending[4]; - bool taken[4]; - bool valid; + uint8_t takenMask; bool allocated; + bool valid; + bool hasConflict; + // TODO: This might be useful in the future + // Tick lastWLWriteTick; }; WLEngine* peerWLEngine; - + PushEngine* peerPushEngine; + Block cacheBlocks[256]; - int reqQueueSize; - std::queue reqQueue; + int numMSHREntry; + int numTgtsPerMSHR; + std::unordered_map> MSHRMap; + + int outstandingMemReqQueueSize; + std::queue outstandingMemReqQueue; + + std::queue addrResponseQueue; + std::queue worklistResponseQueue; + + std::queue evictQueue; - int conflictAddrQueueSize; - std::queue conflictAddrQueue; + EventFunctionWrapper nextMemReqEvent; + void processNextMemReqEvent(); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); From 965a48e61fc7868cf4dfaa190ca99618f0c51d07 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 22 Mar 2022 17:31:55 -0700 Subject: [PATCH 071/247] Compiles. 
--- src/accl/graph/base/SConscript | 2 -- src/accl/graph/base/base_read_engine.cc | 4 +-- src/accl/graph/base/base_read_engine.hh | 11 ++++---- src/accl/graph/base/base_reduce_engine.cc | 2 +- src/accl/graph/base/base_reduce_engine.hh | 9 ++++--- src/accl/graph/base/util.hh | 5 ++++ src/accl/graph/sega/PushEngine.py | 2 +- src/accl/graph/sega/WLEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 31 ++++++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 10 +++++--- src/accl/graph/sega/push_engine.cc | 24 ++++++++++++++---- src/accl/graph/sega/push_engine.hh | 7 +++++ src/accl/graph/sega/wl_engine.cc | 29 +++++++++++---------- src/accl/graph/sega/wl_engine.hh | 4 +-- 14 files changed, 88 insertions(+), 54 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index c6a78eb5e8..8aefca2185 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -33,5 +33,3 @@ SimObject('BaseReduceEngine.py') Source('base_read_engine.cc') Source('base_reduce_engine.cc') Source('util.cc') - -DebugFlag('MPU') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 4192cdb565..894831429b 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/base/base_read_engine.hh" +#include "accl/graph/base/base_read_engine.hh" namespace gem5 { @@ -35,7 +35,7 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - _requestorId(system.getRequestorId(this)), + _requestorId(system->getRequestorId(this)) {} BaseReadEngine::~BaseReadEngine() diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 99f14bcb06..956c50e47d 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_GRAPH_BASE_READ_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_READ_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ #include #include @@ -35,7 +35,7 @@ #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/BaseEngine.hh" +#include "params/BaseReadEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -53,7 +53,7 @@ class BaseReadEngine : public ClockedObject PacketPtr blockedPacket; public: - MemPort(const std::string& name, BaseEngine* owner): + MemPort(const std::string& name, BaseReadEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -69,8 +69,6 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; - bool handleMemResp(PacketPtr resp); - protected: const RequestorID _requestorId; @@ -85,6 +83,7 @@ class BaseReadEngine : public ClockedObject BaseReadEngine(const BaseReadEngineParams ¶ms); ~BaseReadEngine(); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index fbfc613313..82643ba3ff 100644 --- 
a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/base/base_reduce_engine.hh" +#include "accl/graph/base/base_reduce_engine.hh" namespace gem5 { diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index e44f384f26..7851eaf585 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -26,11 +26,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ -#include "accl/base/util.hh" +#include "accl/graph/base/util.hh" #include "params/BaseReduceEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -43,7 +43,6 @@ class BaseReduceEngine : public ClockedObject private: System* system; - bool handleIncomingWL(Addr addr, WorkListItem wl); protected: Addr currentWorkListAddress; @@ -60,6 +59,8 @@ class BaseReduceEngine : public ClockedObject ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } + + void handleIncomingWL(Addr addr, WorkListItem wl); }; } diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/util.hh index a4418a1cb8..1066d37d1c 100644 --- a/src/accl/graph/base/util.hh +++ b/src/accl/graph/base/util.hh @@ -26,6 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#ifndef __ACCL_GRAPH_BASE_UTIL_HH__ +#define __ACCL_GRAPH_BASE_UTIL_HH__ + #include "base/cprintf.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -75,3 +78,5 @@ PacketPtr getUpdatePacket(Addr addr, unsigned int size, uint8_t *data, RequestorID requestorId); } + +#endif // __ACCL_GRAPH_BASE_UTIL_HH__ diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 9036b4e401..129d9454c7 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -35,6 +35,6 @@ class PushEngine(BaseReadEngine): cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr() + base_edge_addr = Param.Addr("") mem_resp_queue_size = Param.Int(0, "") push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index ec9154b138..cab47fbe7b 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -35,6 +35,6 @@ class WLEngine(BaseReduceEngine): cxx_class = 'gem5::WLEngine' resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoaleseEngine(NULL, "") + coalesce_engine = Param.CoalesceEngine(NULL, "") update_queue_size = Param.Int(0, "") on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 22bc0d49a6..663559cc63 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -26,9 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/sega/coalesce_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" -#include "accl/sega/wl_engine.hh" +#include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" namespace gem5 @@ -40,12 +40,13 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), + nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), + nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} -CoalesceEngine::~CoalesceEngine() -{} +// CoalesceEngine::~CoalesceEngine() +// {} void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -86,8 +87,8 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries return false; } else { - if (cacheBlock[block_index].allocated) { - assert(MSHRMap[block_index].size() <= numTgtsPerMSHR) + if (cacheBlocks[block_index].allocated) { + assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { return false; } @@ -122,6 +123,10 @@ CoalesceEngine::recvReadAddr(Addr addr) return true; } } + } else { + assert(cacheBlocks[block_index].hasConflict); + MSHRMap[block_index].push_back(addr); + return true; } } } @@ -167,12 +172,12 @@ CoalesceEngine::processNextRespondEvent() bool CoalesceEngine::handleMemResp(PacketPtr pkt) { - if (pkt->isResp() && pkt->isWrite()) { + if (pkt->isResponse() && pkt->isWrite()) { return true; } Addr addr = pkt->getAddr(); - uint8_t data = pkt->getPtr(); + uint8_t* data = pkt->getPtr(); int block_index = addr % 256; assert((cacheBlocks[block_index].allocated) && // allocated cache block @@ -218,6 +223,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } + + return 
true; } void @@ -229,7 +236,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); - cacheBlocks[block_index].item[wl_offset] = wl; + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); // TODO: Make this more general and programmable. @@ -261,10 +268,10 @@ CoalesceEngine::processNextApplyAndCommitEvent() changedMask |= (1 << i); } uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); - std::memcpy(data[i * 16], wl_data, sizeof(WorkListItem)); + std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); } - if (changed) { + if (changedMask) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = getWritePacket( cacheBlocks[block_index].addr, 64, data, _requestorId); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f5fd85e4cf..6086a8855e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,8 +29,10 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#include "accl/base/base_read_engine.hh" -#include "accl/sega/push_engine.hh" +#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/util.hh" +#include "accl/graph/sega/push_engine.hh" +#include "params/CoalesceEngine.hh" namespace gem5 { @@ -85,7 +87,7 @@ class CoalesceEngine : public BaseReadEngine PARAMS(CoalesceEngine); CoalesceEngine(const CoalesceEngineParams ¶ms); - ~CoalesceEngine(); + // ~CoalesceEngine(); void recvFunctional(PacketPtr pkt); @@ -93,7 +95,7 @@ class CoalesceEngine : public BaseReadEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); -} +}; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c865451999..2a978cfcc5 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/push_engine.hh" +#include "debug/MPU.hh" + namespace gem5 { @@ -35,8 +37,8 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): BaseReadEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), - memRespQueueSize(params.mem_resp_queue_size), pushReqQueueSize(params.push_req_queue_size), + memRespQueueSize(params.mem_resp_queue_size), onTheFlyReadReqs(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextReadEvent([this] { processNextReadEvent(); }, name()), @@ -87,7 +89,7 @@ PushEngine::ReqPort::recvReqRetry() } bool -PushEngine::recvWLItem(WorkListItem wl); +PushEngine::recvWLItem(WorkListItem wl) { assert(pushReqQueue.size() <= pushReqQueueSize); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { @@ -133,14 +135,14 @@ PushEngine::processNextAddrGenEvent() }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + PacketPtr pkt = getReadPacket(addr_queue[index], 64, _requestorId); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; pendingReadReqs.push(pkt); } - pushReadReqs.pop(); + pushReqQueue.pop(); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); @@ -176,6 +178,7 @@ PushEngine::handleMemResp(PacketPtr pkt) if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); } + return true; } void @@ -199,7 +202,8 @@ PushEngine::processNextPushEvent() *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - requestorId); + _requestorId); + if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); DPRINTF(MPU, "%s: Reading %s, updating 
with %d\n" @@ -213,4 +217,14 @@ PushEngine::processNextPushEvent() } } +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; +} + } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index bf645eb119..e97a26c7bd 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/util.hh" #include "params/PushEngine.hh" namespace gem5 @@ -65,6 +66,10 @@ class PushEngine : public BaseReadEngine int pushReqQueueSize; std::queue pushReqQueue; + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map reqValueMap; + // TODO: Possibility of infinite queueing std::queue pendingReadReqs; @@ -72,6 +77,8 @@ class PushEngine : public BaseReadEngine int onTheFlyReadReqs; std::queue memRespQueue; + bool sendPushUpdate(PacketPtr pkt); + EventFunctionWrapper nextAddrGenEvent; void processNextAddrGenEvent(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f0c522ff6f..43ad112db3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -36,13 +36,13 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): BaseReduceEngine(params), respPort(name() + ".resp_port", this), blockedByCoalescer(false), - coaleseEngine(params.coalesce_engine), + coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()) { - coaleseEngine->registerWLEngine(this); + coalesceEngine->registerWLEngine(this); } Port& @@ -82,14 +82,14 @@ WLEngine::startup() uint8_t* data = workListToMemory(vertices[i]); PacketPtr pkt = getWritePacket(0 + i * 
sizeof(WorkListItem), 16, data, 0); - sendMemFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } for (int i = 0; i < 7; i++) { uint8_t* data = edgeToMemory(edges[i]); PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), 16, data, 0); - sendMemFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } uint8_t* first_update_data = new uint8_t [4]; @@ -97,9 +97,9 @@ WLEngine::startup() *tempPtr = 0; PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, requestorId); + 0, 4, first_update_data, _requestorId); - handleWLUpdate(first_update); + handleIncomingUpdate(first_update); } AddrRangeList @@ -135,13 +135,13 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - coaleseEngine->recvFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } AddrRangeList -WLEngine::getAddrRanges() +WLEngine::getAddrRanges() const { - return coaleseEngine->getAddrRanges(); + return coalesceEngine->getAddrRanges(); } void @@ -149,18 +149,18 @@ WLEngine::processNextReadEvent() { PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); - uint32_t update_value = update->getPtr(); + uint32_t* update_value = update->getPtr(); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { - onTheFlyUpdateMap[update_addr] = update_value + onTheFlyUpdateMap[update_addr] = *update_value; updateQueue.pop(); } } else { // TODO: Generalize this to reduce function rather than just min onTheFlyUpdateMap[update_addr] = - min(update_addr, onTheFlyUpdateMap[update_addr]); + std::min(*update_value, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); // TODO: Add a stat to count the number of coalescions } @@ -176,8 +176,9 @@ void WLEngine::processNextReduceEvent() { // TODO: Generalize this to reduce function rather than just min - currentWorkList.temp_prop = min(onTheFlyUpdateMap[currentWorkListAddress], - 
currentWorkList.temp_prop); + currentWorkList.temp_prop = std::min( + onTheFlyUpdateMap[currentWorkListAddress], + currentWorkList.temp_prop); // TODO: Add a delay here coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1846825951..3ce01dd69d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -63,7 +63,7 @@ class WLEngine : public BaseReduceEngine RespPort respPort; bool blockedByCoalescer; - CoalesceEngine* coaleseEngine; + CoalesceEngine* coalesceEngine; int updateQueueSize; std::queue updateQueue; @@ -84,7 +84,7 @@ class WLEngine : public BaseReduceEngine void processNextReduceEvent(); protected: - virtual void scheduleReduceEvent() = 0; + virtual void scheduleReduceEvent(); public: PARAMS(WLEngine); From df5706a46ff4b39293a26c4b3c06dc7aee1aa2d5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Mar 2022 00:34:29 -0700 Subject: [PATCH 072/247] Debugging after compilation. 
Loop writting to mem --- configs/accl/sega.py | 28 +++++--- src/accl/graph/base/base_reduce_engine.cc | 8 --- src/accl/graph/base/base_reduce_engine.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 83 +++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 5 +- src/accl/graph/sega/wl_engine.cc | 60 +++++++++++----- src/accl/graph/sega/wl_engine.hh | 6 +- 8 files changed, 126 insertions(+), 72 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 163ea169d9..f71b0e73e0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,15 +4,12 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.lock_dir = LockDirectory() - self.push_engine = PushEngine() - self.apply_engine = ApplyEngine(push_engine = self.push_engine, lock_dir = self.lock_dir) - self.wl_engine = WLEngine(apply_engine = self.apply_engine, lock_dir = self.lock_dir) + self.push_engine = PushEngine(base_edge_addr=0x100000, push_req_queue_size = 16) + self.coalesce_engine = CoalesceEngine(peer_push_engine=self.push_engine) + self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size = 16, on_the_fly_update_map_size=8) self.interconnect = SystemXBar() - - self.interconnect.cpu_side_ports = self.wl_engine.mem_port - self.interconnect.cpu_side_ports = self.apply_engine.mem_port + self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port def getRespPort(self): @@ -30,6 +27,16 @@ def getMemPort(self): def setMemPort(self, port): self.interconnect.mem_side_ports = port + def getVertexMemPort(self): + return self.coalesce_engine.mem_port + def setVertexMemPort(self, port): + self.coalesce_engine.mem_port = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + class SEGA(System): def __init__(self): super(SEGA, self).__init__() @@ 
-40,8 +47,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") - # self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) - + # self.mem_ctrl = MemCtrl() + # self.mem_ctrl.dram = DDR4_2400_8x8(range=AddrRange(start=0x000000, size="1MiB")) + # self.mem_ctrl.nvm = NVM_2400_1x64(range=AddrRange(start=0x100000, size="1MiB")) self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.port) @@ -50,6 +58,6 @@ def __init__(self): m5.instantiate() -exit_event = m5.simulate(1000000) +exit_event = m5.simulate() print("Simulation finished!") exit() diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index 82643ba3ff..38a8662ed0 100644 --- a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -40,12 +40,4 @@ BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): BaseReduceEngine::~BaseReduceEngine() {} -void -BaseReduceEngine::handleIncomingWL(Addr addr, WorkListItem wl) -{ - currentWorkListAddress = addr; - currentWorkList = wl; - scheduleReduceEvent(); -} - } diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index 7851eaf585..64d6e4c8c0 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -50,8 +50,6 @@ class BaseReduceEngine : public ClockedObject const RequestorID _requestorId; - virtual void scheduleReduceEvent() = 0; - public: PARAMS(BaseReduceEngine); @@ -60,7 +58,7 @@ class BaseReduceEngine : public ClockedObject RequestorID requestorId() { return _requestorId; } - void handleIncomingWL(Addr addr, WorkListItem wl); + virtual void handleIncomingWL(Addr addr, WorkListItem wl) = 0; }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 663559cc63..aa6bc99887 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,8 +45,16 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} -// CoalesceEngine::~CoalesceEngine() -// {} +void +CoalesceEngine::startup() +{ + for (int i = 0; i < 256; i++) { + cacheBlocks[i].takenMask = 0; + cacheBlocks[i].allocated = false; + cacheBlocks[i].valid = false; + cacheBlocks[i].hasConflict = false; + } +} void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -64,6 +72,8 @@ bool CoalesceEngine::recvReadAddr(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); + DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", + __func__, addr); Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; @@ -71,11 +81,13 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((cacheBlocks[block_index].addr == alligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" + , __func__, addr); addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -93,18 +105,26 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } // MSHR available but conflict + DPRINTF(MPU, "%s: Read request with addr: %lu missed with " + "conflict. 
Making a request for " + "alligned_addr: %lu.\n", + __func__, addr, alligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); return true; } else { // MSHR available and no conflict assert( - outstandingMemReqQueue.size() <= + outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - if (outstandingMemReqQueue.size() == + if (outstandingMemReqQueue.size() == outstandingMemReqQueueSize) { return false; } + DPRINTF(MPU, "%s: Read request with addr: " + "%lu missed with no conflict. " + "Making a request for alligned_addr: %lu.\n" + , __func__, addr, alligned_addr); cacheBlocks[block_index].addr = alligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; @@ -112,7 +132,7 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].hasConflict = false; MSHRMap[block_index].push_back(addr); - PacketPtr pkt = getReadPacket(alligned_addr, + PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); outstandingMemReqQueue.push(pkt); @@ -124,11 +144,15 @@ CoalesceEngine::recvReadAddr(Addr addr) } } } else { - assert(cacheBlocks[block_index].hasConflict); + if ((!cacheBlocks[block_index].hasConflict) && + ((addr < cacheBlocks[block_index].addr) || + (addr >= (cacheBlocks[block_index].addr + 64)))) { + cacheBlocks[block_index].hasConflict = true; + } MSHRMap[block_index].push_back(addr); return true; } - } + } } void @@ -143,7 +167,7 @@ CoalesceEngine::processNextMemReqEvent() if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); + schedule(nextMemReqEvent, nextCycle()); } } @@ -152,23 +176,19 @@ CoalesceEngine::processNextRespondEvent() { Addr addr_response = addrResponseQueue.front(); WorkListItem worklist_response = worklistResponseQueue.front(); - + peerWLEngine->handleIncomingWL(addr_response, worklist_response); addrResponseQueue.pop(); worklistResponseQueue.pop(); if ((!nextRespondEvent.scheduled()) && - 
(!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } -/* - void recvWLWrite(Addr addr, WorkListItem wl); -*/ - bool CoalesceEngine::handleMemResp(PacketPtr pkt) { @@ -183,11 +203,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR - cacheBlocks[block_index].valid = true; for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); } + cacheBlocks[block_index].valid = true; int bias = 0; std::vector servicedIndices; @@ -201,12 +221,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) worklistResponseQueue.push( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - servicedIndices.push_back(i); + servicedIndices.push_back(i); } } // TODO: We Can use taken instead of this for (int i = 0; i < servicedIndices.size(); i++) { - MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; } @@ -219,7 +239,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -233,12 +253,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; - - assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + DPRINTF(MPU, "%s: Recieved a WorkList write. 
addr: %lu, wl: %s.\n", + __func__, addr, wl.to_string()); + DPRINTF(MPU, "%s: alligned_addr: %lu, block_index: %d, wl_offset: %d, " + "takenMask: %u.\n", __func__, alligned_addr, + block_index, wl_offset, cacheBlocks[block_index].takenMask); + assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - + // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) if ((cacheBlocks[block_index].takenMask == 0)) { @@ -267,6 +291,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); } + DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. " + "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, + i, cacheBlocks[block_index].items[i].to_string()); uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); } @@ -275,7 +302,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = getWritePacket( cacheBlocks[block_index].addr, 64, data, _requestorId); - + if ((cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ Addr miss_addr = MSHRMap[block_index][0]; @@ -304,7 +331,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].hasConflict = true; evictQueue.pop(); } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { outstandingMemReqQueue.push(write_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { @@ -325,16 +352,16 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].hasConflict = false; evictQueue.pop(); } 
else { - DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , + DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , __func__); } } - + if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); + schedule(nextMemReqEvent, nextCycle()); } - + if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty())) { schedule(nextApplyAndCommitEvent, nextCycle()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6086a8855e..6dc7bc1001 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -56,7 +56,7 @@ class CoalesceEngine : public BaseReadEngine WLEngine* peerWLEngine; PushEngine* peerPushEngine; - + Block cacheBlocks[256]; int numMSHREntry; @@ -71,6 +71,8 @@ class CoalesceEngine : public BaseReadEngine std::queue evictQueue; + virtual void startup(); + EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 2a978cfcc5..06b5381641 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -95,6 +95,7 @@ PushEngine::recvWLItem(WorkListItem wl) if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { return false; } + pushReqQueue.push(wl); if ((!nextAddrGenEvent.scheduled()) && @@ -204,10 +205,10 @@ PushEngine::processNextPushEvent() sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, _requestorId); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); // TODO: Erase map entries here. 
} } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 43ad112db3..b7f59987cb 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -154,45 +154,70 @@ WLEngine::processNextReadEvent() if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { + DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " + "update_addr: %lu, update_value: %u.\n", + __func__, update_addr, *update_value); onTheFlyUpdateMap[update_addr] = *update_value; + DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); } } else { // TODO: Generalize this to reduce function rather than just min + DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." + "update_addr: %lu, update_value: %u, old_value: %u.\n", + __func__, update_addr, *update_value, + onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions } if ((!nextReadEvent.scheduled()) && - ((!updateQueue.empty()) || - (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize))) { + (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); } } void -WLEngine::processNextReduceEvent() +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { - // TODO: Generalize this to reduce function rather than just min - currentWorkList.temp_prop = std::min( - onTheFlyUpdateMap[currentWorkListAddress], - currentWorkList.temp_prop); - // TODO: Add a delay here - coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); - - onTheFlyUpdateMap.erase(currentWorkListAddress); - currentWorkListAddress = 
0; - currentWorkList = {0, 0, 0, 0}; + assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + addrWorkListMap[addr] = wl; + // TODO: Add checks to see if scheduling is necessary or correct. + if ((!nextReduceEvent.scheduled()) && (!addrWorkListMap.empty())) { + schedule(nextReduceEvent, nextCycle()); + } } void -WLEngine::scheduleReduceEvent() +WLEngine::processNextReduceEvent() { - // TODO: Add checks to see if scheduling is necessary or correct. - if (!nextReduceEvent.scheduled()) { - schedule(nextReduceEvent, nextCycle()); + + std::unordered_map::iterator it = + addrWorkListMap.begin(); + + std::vector servicedAddresses; + while (it != addrWorkListMap.end()) { + Addr addr = it->first; + WorkListItem wl = it->second; + uint32_t update_value = onTheFlyUpdateMap[addr]; + DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " + "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, + onTheFlyUpdateMap[addr]); + // TODO: Generalize this to reduce function rather than just min + wl.temp_prop = std::min(update_value, wl.temp_prop); + coalesceEngine->recvWLWrite(addr, wl); + servicedAddresses.push_back(addr); + it++; + } + + addrWorkListMap.clear(); + for (int i = 0; i < servicedAddresses.size(); i++) { + onTheFlyUpdateMap.erase(servicedAddresses[i]); } } @@ -206,6 +231,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push(pkt); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 3ce01dd69d..1ccb13d91e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -71,6 +71,7 @@ class WLEngine : public BaseReduceEngine int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; + std::unordered_map addrWorkListMap; virtual void startup(); void recvFunctional(PacketPtr pkt); @@ -83,9 +84,6 @@ 
class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); - protected: - virtual void scheduleReduceEvent(); - public: PARAMS(WLEngine); @@ -95,6 +93,8 @@ class WLEngine : public BaseReduceEngine PortID idx=InvalidPortID) override; bool handleIncomingUpdate(PacketPtr pkt); + + virtual void handleIncomingWL(Addr addr, WorkListItem wl); }; } From ca2f0692bf3cf8fcd4b4459e1b352c6d795b95b0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Mar 2022 00:51:48 -0700 Subject: [PATCH 073/247] Correctness tested with small graph. --- src/accl/graph/sega/coalesce_engine.cc | 23 ++++++++++++++++++++--- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index aa6bc99887..62062116c2 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -265,8 +265,19 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) // TODO: Make this more general and programmable. 
// && (cacheBlocks[block_index].hasConflict) + bool found = false; if ((cacheBlocks[block_index].takenMask == 0)) { - evictQueue.push(block_index); + for (auto index : evictQueue) { + if (block_index == index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + } + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } if ((!nextApplyAndCommitEvent.scheduled()) && @@ -329,7 +340,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - evictQueue.pop(); + evictQueue.pop_front(); + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } else if ((!cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { outstandingMemReqQueue.push(write_pkt); @@ -350,11 +363,15 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - evictQueue.pop(); + evictQueue.pop_front(); + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } else { DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , __func__); } + } else { + evictQueue.pop_front(); } if ((!nextMemReqEvent.scheduled()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6dc7bc1001..3290f646f4 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -69,7 +69,7 @@ class CoalesceEngine : public BaseReadEngine std::queue addrResponseQueue; std::queue worklistResponseQueue; - std::queue evictQueue; + std::deque evictQueue; virtual void startup(); From 358c8e6e9e0a59f7a5a3d6f780e47b559d3e524e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 23 Mar 2022 09:53:26 -0700 Subject: [PATCH 074/247] Added performance statistics. 
--- src/accl/graph/sega/coalesce_engine.cc | 32 +++++++++++++++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 18 +++++++++++++++ src/accl/graph/sega/wl_engine.cc | 22 +++++++++++++++++- src/accl/graph/sega/wl_engine.hh | 15 ++++++++++++ 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 62062116c2..d58a36188e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -42,7 +42,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) + nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), + stats(*this) {} void @@ -86,6 +87,8 @@ CoalesceEngine::recvReadAddr(Addr addr) addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.readHits++; + stats.numVertexReads++; if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { @@ -138,6 +141,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { + stats.numVertexBlockReads++; schedule(nextMemReqEvent, nextCycle()); } return true; @@ -221,6 +225,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) worklistResponseQueue.push( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.numVertexReads++; servicedIndices.push_back(i); } } @@ -262,6 +267,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 
<< wl_offset); + stats.numVertexWrites++; // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) @@ -376,6 +382,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { + stats.numVertexBlockWrites++; schedule(nextMemReqEvent, nextCycle()); } @@ -385,4 +392,27 @@ CoalesceEngine::processNextApplyAndCommitEvent() } } +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + + ADD_STAT(numVertexBlockReads, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numVertexBlockWrites, statistics::units::Count::get(), + "Number of memory blocks writes for vertecies"), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; +} + } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3290f646f4..d45fffa3aa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -32,6 +32,7 @@ #include "accl/graph/base/base_read_engine.hh" #include "accl/graph/base/util.hh" #include "accl/graph/sega/push_engine.hh" +#include "base/statistics.hh" #include "params/CoalesceEngine.hh" namespace gem5 @@ -82,6 +83,23 @@ class CoalesceEngine : public BaseReadEngine EventFunctionWrapper nextApplyAndCommitEvent; void processNextApplyAndCommitEvent(); + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + void regStats() override; + + CoalesceEngine &coalesce; + + statistics::Scalar numVertexBlockReads; + 
statistics::Scalar numVertexBlockWrites; + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + }; + + CoalesceStats stats; + protected: virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b7f59987cb..517d10ef67 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -40,7 +40,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), - nextReduceEvent([this]{ processNextReduceEvent(); }, name()) + nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + stats(*this) { coalesceEngine->registerWLEngine(this); } @@ -171,6 +172,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); + stats.onTheFlyCoalesce++; updateQueue.pop(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions @@ -209,6 +211,7 @@ WLEngine::processNextReduceEvent() "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, onTheFlyUpdateMap[addr]); // TODO: Generalize this to reduce function rather than just min + stats.numReduce++; wl.temp_prop = std::min(update_value, wl.temp_prop); coalesceEngine->recvWLWrite(addr, wl); servicedAddresses.push_back(addr); @@ -239,4 +242,21 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return true; } +WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) + : statistics::Group(&_wl), + wl(_wl), + + ADD_STAT(numReduce, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(onTheFlyCoalesce, statistics::units::Count::get(), + "Number of memory blocks read for vertecies") +{ +} + +void +WLEngine::WorkListStats::regStats() +{ + using 
namespace statistics; +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1ccb13d91e..891916e7af 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "base/statistics.hh" #include "params/WLEngine.hh" namespace gem5 @@ -84,6 +85,20 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + struct WorkListStats : public statistics::Group + { + WorkListStats(WLEngine &worklist); + + void regStats() override; + + WLEngine &wl; + + statistics::Scalar numReduce; + statistics::Scalar onTheFlyCoalesce; + }; + + WorkListStats stats; + public: PARAMS(WLEngine); From c6ae6a6c93f0527d83044d4b207a9507a779a1b3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 14:10:40 -0700 Subject: [PATCH 075/247] Updating definitions for structs and removing unnecessary funcs. 
--- configs/accl/sega.py | 50 +++++--- src/accl/graph/base/base_read_engine.cc | 15 +++ src/accl/graph/base/base_read_engine.hh | 4 +- src/accl/graph/base/base_reduce_engine.hh | 2 - src/accl/graph/base/util.cc | 145 ---------------------- src/accl/graph/base/util.hh | 54 ++++---- src/accl/graph/sega/coalesce_engine.cc | 98 ++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 12 +- src/accl/graph/sega/push_engine.cc | 42 ++++++- src/accl/graph/sega/push_engine.hh | 4 + src/accl/graph/sega/wl_engine.cc | 59 ++------- src/accl/graph/sega/wl_engine.hh | 1 - 12 files changed, 201 insertions(+), 285 deletions(-) delete mode 100644 src/accl/graph/base/util.cc diff --git a/configs/accl/sega.py b/configs/accl/sega.py index f71b0e73e0..8ea247106e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,9 +4,13 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x100000, push_req_queue_size = 16) - self.coalesce_engine = CoalesceEngine(peer_push_engine=self.push_engine) - self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size = 16, on_the_fly_update_map_size=8) + self.push_engine = PushEngine(base_edge_addr=0x100000, + push_req_queue_size = 16) + self.coalesce_engine = CoalesceEngine( + peer_push_engine=self.push_engine) + self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, + update_queue_size = 16, + on_the_fly_update_map_size=8) self.interconnect = SystemXBar() self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port @@ -27,31 +31,41 @@ def getMemPort(self): def setMemPort(self, port): self.interconnect.mem_side_ports = port - def getVertexMemPort(self): - return self.coalesce_engine.mem_port - def setVertexMemPort(self, port): - self.coalesce_engine.mem_port = port +class MPUMemory(SubSystem): + def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): + super(MPUMemory, self).__init__() + self.vertex_mem_ctrl = SimpleMemory( + 
range=vertex_range, bandwidth="25GB/s", + latency="30ns", image_file=vertex_binary) + self.edge_mem_ctrl = SimpleMemory( + range=edge_range, bandwidth="25GB/s", + latency="30ns", image_file=edge_binary) + self.interconnect = SystemXBar() + + self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port + self.interconnect.mem_side_ports = self.edge_mem_ctrl.port - def getEdgeMemPort(self): - return self.push_engine.mem_port - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port + def getPort(self): + return self.interconnect.cpu_side_ports + def setPort(self, port): + self.interconnect.cpu_side_ports = port class SEGA(System): def __init__(self): super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() self.mpu = MPU() - self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") - # self.mem_ctrl = MemCtrl() - # self.mem_ctrl.dram = DDR4_2400_8x8(range=AddrRange(start=0x000000, size="1MiB")) - # self.mem_ctrl.nvm = NVM_2400_1x64(range=AddrRange(start=0x100000, size="1MiB")) + self.mem_ctrl = MPUMemory( + vertex_range=AddrRange(start=0x000000, size="2GiB"), + vertex_binary="live-journal/graph_binaries/vertices", + edge_range=AddrRange(start=0x80000000, size="2GiB"), + edge_binary="live-journal/graph_binaries/edgelist_0") + self.mpu.setReqPort(self.mpu.getRespPort()) - self.mpu.setMemPort(self.mem_ctrl.port) + self.mpu.setMemPort(self.mem_ctrl.getPort()) system = SEGA() root = Root(full_system = False, system = system) diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 894831429b..a32237db35 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -83,4 +83,19 @@ BaseReadEngine::MemPort::recvReqRetry() } } +PacketPtr +BaseReadEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = 
std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + } diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 956c50e47d..591b51aeb7 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -78,12 +78,14 @@ class BaseReadEngine : public ClockedObject virtual bool handleMemResp(PacketPtr pkt) = 0; + PacketPtr createReadPacket(Addr addr, unsigned int size); + public: PARAMS(BaseReadEngine); BaseReadEngine(const BaseReadEngineParams ¶ms); ~BaseReadEngine(); - + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index 64d6e4c8c0..f2245f571f 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -45,8 +45,6 @@ class BaseReduceEngine : public ClockedObject protected: - Addr currentWorkListAddress; - WorkListItem currentWorkList; const RequestorID _requestorId; diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc deleted file mode 100644 index 4172607ed0..0000000000 --- a/src/accl/graph/base/util.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/util.hh" - -namespace gem5 -{ - -WorkListItem -memoryToWorkList(uint8_t* data){ - WorkListItem wl; - - uint32_t temp_prop = *((uint32_t*) data); - uint32_t prop = *((uint32_t*) (data + 4)); - uint32_t degree = *((uint32_t*) (data + 8)); - uint32_t addr = *((uint32_t*) (data + 12)); - - wl = {temp_prop, prop, degree, addr}; - return wl; -} - -uint8_t* -workListToMemory(WorkListItem wl){ - int data_size = sizeof(WorkListItem) / sizeof(uint8_t); - uint8_t* data = new uint8_t [data_size]; - - uint32_t* tempPtr = (uint32_t*) data; - *tempPtr = wl.temp_prop; - - uint32_t* propPtr = (uint32_t*) (data + 4); - *propPtr = wl.prop; - - uint32_t* degreePtr = (uint32_t*) (data + 8); - *degreePtr = wl.degree; - - uint32_t* edgePtr = (uint32_t*) (data + 12); - *edgePtr = wl.edgeIndex; - - return data; -} - -// Edge: (weight: 64 bits, neighbor: 64 bits) -Edge -memoryToEdge(uint8_t *data) -{ - uint64_t weight = *((uint64_t*) data); - Addr neighbor = *((Addr*) (data + 8)); // data + 8 because weight: 8 bytes - Edge e = {weight, neighbor}; - return e; -} - -// Edge: (weight: 64 bits, neighbor: 64 bits) -uint8_t* -edgeToMemory(Edge e) -{ - int data_size = (int) ((sizeof(Edge)) / (sizeof(uint8_t))); - - uint8_t* data = new uint8_t [data_size]; - - uint64_t* weightPtr = (uint64_t*) data; - *weightPtr = e.weight; - - Addr* neighborPtr = (Addr*) (data + 8); // data + 8 because weight: 8 bytes - *neighborPtr = e.neighbor; - - return data; -} - -PacketPtr -getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) -{ - RequestPtr req = std::make_shared(addr, size, 0, requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; -} - -PacketPtr -getWritePacket(Addr addr, unsigned int size, - uint8_t* data, RequestorID requestorId) -{ - RequestPtr req = 
std::make_shared(addr, size, 0, - requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -PacketPtr -getUpdatePacket(Addr addr, unsigned int size, - uint8_t *data, RequestorID requestorId) -{ - RequestPtr req = std::make_shared(addr, size, 0, - requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -} diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/util.hh index 1066d37d1c..b51a9f0781 100644 --- a/src/accl/graph/base/util.hh +++ b/src/accl/graph/base/util.hh @@ -30,52 +30,56 @@ #define __ACCL_GRAPH_BASE_UTIL_HH__ #include "base/cprintf.hh" -#include "base/types.hh" -#include "mem/packet.hh" -#include "mem/request.hh" namespace gem5 { -struct WorkListItem +struct __attribute__ ((packed)) WorkListItem { - uint32_t temp_prop; - uint32_t prop; - uint32_t degree; - uint32_t edgeIndex; + uint32_t tempProp : 32; + uint32_t prop : 32; + uint32_t degree : 32; + uint32_t edgeIndex : 32; std::string to_string() { return csprintf( "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", - temp_prop, prop, degree, edgeIndex); + tempProp, prop, degree, edgeIndex); } + WorkListItem(): + tempProp(0), + prop(0), + degree(0), + edgeIndex(0) + {} + + WorkListItem(uint32_t temp_prop, uint32_t prop, + uint32_t degree, uint32_t edge_index): + tempProp(temp_prop), + prop(prop), + degree(degree), + edgeIndex(edge_index) + {} + }; -struct Edge +struct __attribute__ ((packed)) Edge { - uint64_t weight; - Addr neighbor; + uint16_t weight : 16; + uint64_t neighbor : 48; std::string to_string() { return csprintf("Edge{weight: %lu, neighbor: 
%lu}", weight, neighbor); } -}; -WorkListItem memoryToWorkList(uint8_t* data); -uint8_t* workListToMemory(WorkListItem wl); - -Edge memoryToEdge(uint8_t* data); -uint8_t* edgeToMemory(Edge e); - -PacketPtr getReadPacket(Addr addr, unsigned int size, - RequestorID requestorId); -PacketPtr getWritePacket(Addr addr, unsigned int size, - uint8_t* data, RequestorID requestorId); -PacketPtr getUpdatePacket(Addr addr, unsigned int size, - uint8_t *data, RequestorID requestorId); + Edge(uint16_t weight, uint64_t neighbor): + weight(weight), + neighbor(neighbor) + {} +}; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d58a36188e..67874cb9b9 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -75,29 +75,33 @@ CoalesceEngine::recvReadAddr(Addr addr) assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); - Addr alligned_addr = (addr / 64) * 64; - int block_index = alligned_addr % 256; - int wl_offset = (addr - alligned_addr) / 16; + Addr aligned_addr = (addr / 64) * 64; + int block_index = aligned_addr % 256; + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - if ((cacheBlocks[block_index].addr == alligned_addr) && + if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" , __func__, addr); + // TODO: Make addrQueue and wlQueue into one std::pair addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.readHits++; stats.numVertexReads++; - if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + + assert(!worklistResponseQueue.empty() && !addrResponseQueue.empty()); + if 
(!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } return true; } else { // miss if (MSHRMap.find(block_index) == MSHRMap.end()) { + assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries return false; @@ -110,12 +114,14 @@ CoalesceEngine::recvReadAddr(Addr addr) // MSHR available but conflict DPRINTF(MPU, "%s: Read request with addr: %lu missed with " "conflict. Making a request for " - "alligned_addr: %lu.\n", - __func__, addr, alligned_addr); + "aligned_addr: %lu.\n", + __func__, addr, aligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); return true; } else { + // TODO: Set valid to false every deallocation and + // assert valid == false here. // MSHR available and no conflict assert( outstandingMemReqQueue.size() <= @@ -126,31 +132,34 @@ CoalesceEngine::recvReadAddr(Addr addr) } DPRINTF(MPU, "%s: Read request with addr: " "%lu missed with no conflict. " - "Making a request for alligned_addr: %lu.\n" - , __func__, addr, alligned_addr); - cacheBlocks[block_index].addr = alligned_addr; + "Making a request for aligned_addr: %lu.\n" + , __func__, addr, aligned_addr); + cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; MSHRMap[block_index].push_back(addr); - PacketPtr pkt = getReadPacket(alligned_addr, - 64, _requestorId); + // TODO: Parameterize 64 to memory atom size + PacketPtr pkt = createReadPacket(aligned_addr, 64); outstandingMemReqQueue.push(pkt); - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - stats.numVertexBlockReads++; + stats.numVertexBlockReads++; + + assert(!outstandingMemReqQueue.empty()); + if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); } return true; } } } else { + if (MSHRMap[block_index].size() == 
numTgtsPerMSHR) { + return false; + } if ((!cacheBlocks[block_index].hasConflict) && - ((addr < cacheBlocks[block_index].addr) || - (addr >= (cacheBlocks[block_index].addr + 64)))) { + (aligned_addr != cacheBlocks[block_index].addr)) { cacheBlocks[block_index].hasConflict = true; } MSHRMap[block_index].push_back(addr); @@ -196,20 +205,24 @@ CoalesceEngine::processNextRespondEvent() bool CoalesceEngine::handleMemResp(PacketPtr pkt) { - if (pkt->isResponse() && pkt->isWrite()) { + assert(pkt->isResponse()); + if (pkt->isWrite()) { return true; } Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - int block_index = addr % 256; + int block_index = addr % 256; // TODO: After parameterizing the cache size + // this 256 number should change to the cache + // size parameter. assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR for (int i = 0; i < 4; i++) { - cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); + cacheBlocks[block_index].items[i] = *((WorkListItem*) ( + data + (i * sizeof(WorkListItem)))); } cacheBlocks[block_index].valid = true; @@ -252,16 +265,32 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } +PacketPtr +CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - Addr alligned_addr = (addr / 64) * 64; - int block_index = alligned_addr % 256; - int wl_offset = (addr - alligned_addr) / 16; + Addr aligned_addr = (addr / 64) * 64; + int block_index = aligned_addr % 256; + int 
wl_offset = (addr - aligned_addr) / 16; DPRINTF(MPU, "%s: Recieved a WorkList write. addr: %lu, wl: %s.\n", __func__, addr, wl.to_string()); - DPRINTF(MPU, "%s: alligned_addr: %lu, block_index: %d, wl_offset: %d, " - "takenMask: %u.\n", __func__, alligned_addr, + DPRINTF(MPU, "%s: aligned_addr: %lu, block_index: %d, wl_offset: %d, " + "takenMask: %u.\n", __func__, aligned_addr, block_index, wl_offset, cacheBlocks[block_index].takenMask); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); @@ -298,35 +327,36 @@ CoalesceEngine::processNextApplyAndCommitEvent() { int block_index = evictQueue.front(); uint8_t changedMask = 0; + // TODO: parameterize 64 to memory atom size uint8_t data[64]; for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].temp_prop); + cacheBlocks[block_index].items[i].tempProp); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); } DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. 
" "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, i, cacheBlocks[block_index].items[i].to_string()); - uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); - std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); + uint8_t* wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); + std::memcpy(data + (i * sizeof(WorkListItem)), + wl_data, sizeof(WorkListItem)); } if (changedMask) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - PacketPtr write_pkt = getWritePacket( - cacheBlocks[block_index].addr, 64, data, _requestorId); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, 64, data); if ((cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ Addr miss_addr = MSHRMap[block_index][0]; // TODO: Make sure this trick works; Addr alligned_miss_addr = (miss_addr / 64) * 64; - PacketPtr read_pkt = getReadPacket( - alligned_miss_addr, 64, _requestorId); + PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); outstandingMemReqQueue.push(write_pkt); outstandingMemReqQueue.push(read_pkt); // TODO: This should be improved diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index d45fffa3aa..4bb21676d4 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -35,6 +35,8 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" +// TODO: Add parameters for size, memory atom size, type size, +// length of items in the blocks. 
namespace gem5 { @@ -53,6 +55,13 @@ class CoalesceEngine : public BaseReadEngine bool hasConflict; // TODO: This might be useful in the future // Tick lastWLWriteTick; + Block(): + addr(0), + takenMask(0), + allocated(false), + valid(false), + hasConflict(false) + {} }; WLEngine* peerWLEngine; @@ -74,6 +83,8 @@ class CoalesceEngine : public BaseReadEngine virtual void startup(); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); @@ -107,7 +118,6 @@ class CoalesceEngine : public BaseReadEngine PARAMS(CoalesceEngine); CoalesceEngine(const CoalesceEngineParams ¶ms); - // ~CoalesceEngine(); void recvFunctional(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 06b5381641..d09da113ee 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -57,6 +57,19 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::startup() +{ + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + + sendPushUpdate(first_update); +} + + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -136,7 +149,7 @@ PushEngine::processNextAddrGenEvent() }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, _requestorId); + PacketPtr pkt = createReadPacket(addr_queue[index], 64); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; @@ -182,6 +195,7 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } +// FIXME: FIX THIS FUNCTION FOR TIMING AND FUNCTIONAL ACCURACY. 
void PushEngine::processNextPushEvent() { @@ -196,17 +210,16 @@ PushEngine::processNextPushEvent() int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); - Edge e = memoryToEdge(curr_edge_data); + Edge* e = (Edge*) (curr_edge_data); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; - PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - _requestorId); + PacketPtr update = createUpdatePacket(e->neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); + , __func__, e->to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); // TODO: Erase map entries here. 
@@ -218,6 +231,23 @@ PushEngine::processNextPushEvent() } } +PacketPtr +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + bool PushEngine::sendPushUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e97a26c7bd..81acc9862b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -77,6 +77,10 @@ class PushEngine : public BaseReadEngine int onTheFlyReadReqs; std::queue memRespQueue; + virtual void startup(); + + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + bool sendPushUpdate(PacketPtr pkt); EventFunctionWrapper nextAddrGenEvent; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 517d10ef67..b874ec65ec 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -56,53 +56,6 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } } -void -WLEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - //FIXME: The WLEngine no longer has a MemPort. Update this to - // work with the CoalesceEngine instead. 
- WorkListItem vertices [5] = { - {10000, 10000, 3, 0}, // Addr: 0 - {10000, 10000, 1, 3}, // Addr: 16 - {10000, 10000, 1, 4}, // Addr: 32 - {10000, 10000, 1, 5}, // Addr: 48 - {10000, 10000, 0, 6} // Addr: 64 - }; - Edge edges [7] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64}, // Addr: 1048640 - {0, 32} - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - coalesceEngine->recvFunctional(pkt); - } - - for (int i = 0; i < 7; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - coalesceEngine->recvFunctional(pkt); - } - - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, _requestorId); - - handleIncomingUpdate(first_update); -} - AddrRangeList WLEngine::RespPort::getAddrRanges() const { @@ -152,6 +105,7 @@ WLEngine::processNextReadEvent() Addr update_addr = update->getAddr(); uint32_t* update_value = update->getPtr(); + // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { @@ -178,6 +132,7 @@ WLEngine::processNextReadEvent() // TODO: Add a stat to count the number of coalescions } + // TODO: Only schedule nextReadEvent only when it has to be scheduled if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); @@ -208,11 +163,12 @@ WLEngine::processNextReduceEvent() WorkListItem wl = it->second; uint32_t update_value = onTheFlyUpdateMap[addr]; DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " - "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, + 
"%d, with new update: %d.\n", __func__, addr, wl.tempProp, onTheFlyUpdateMap[addr]); // TODO: Generalize this to reduce function rather than just min + wl.tempProp = std::min(update_value, wl.tempProp); stats.numReduce++; - wl.temp_prop = std::min(update_value, wl.temp_prop); + coalesceEngine->recvWLWrite(addr, wl); servicedAddresses.push_back(addr); it++; @@ -227,16 +183,15 @@ WLEngine::processNextReduceEvent() bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - // TODO: Coalesce updates here too assert(updateQueue.size() <= updateQueueSize); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } updateQueue.push(pkt); + assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); - if ((!nextReadEvent.scheduled()) && - (!updateQueue.empty())) { + if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } return true; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 891916e7af..ef18956ec1 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -73,7 +73,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map onTheFlyUpdateMap; std::unordered_map addrWorkListMap; - virtual void startup(); void recvFunctional(PacketPtr pkt); From aa5a5e06804582845ae1c33732d759a1d51a3ece Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 18:03:35 -0700 Subject: [PATCH 076/247] Fixing base_edge_addr in config and debugs. 
--- configs/accl/sega.py | 6 +++--- src/accl/graph/base/SConscript | 1 - src/accl/graph/sega/push_engine.cc | 11 +++++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8ea247106e..680157ba7e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,7 +4,7 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x100000, + self.push_engine = PushEngine(base_edge_addr=0x80000000, push_req_queue_size = 16) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) @@ -60,9 +60,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="live-journal/graph_binaries/vertices", + vertex_binary="epinions/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="live-journal/graph_binaries/edgelist_0") + edge_binary="epinions/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.getPort()) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 8aefca2185..ea96f4323b 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -32,4 +32,3 @@ SimObject('BaseReduceEngine.py') Source('base_read_engine.cc') Source('base_reduce_engine.cc') -Source('util.cc') diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d09da113ee..c305a4bbb9 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -203,23 +203,26 @@ PushEngine::processNextPushEvent() RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); + DPRINTF(MPU, "%s: Looking at the front of the queue. 
pkt->Addr: %lu.\n", + __func__, pkt->getAddr()); + Addr offset = reqOffsetMap[req]; int num_edges = reqNumEdgeMap[req]; uint32_t value = reqValueMap[req]; - int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); + uint8_t *curr_edge_data = data + offset + (i * sizeof(Edge)); Edge* e = (Edge*) (curr_edge_data); + DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; + DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", + __func__, e->neighbor, *update_data); PacketPtr update = createUpdatePacket(e->neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e->to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); // TODO: Erase map entries here. 
From b8df760f0512d590c32826349e408cebe0e075bb Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 31 Mar 2022 19:00:29 -0700 Subject: [PATCH 077/247] Changing queue to deque --- src/accl/graph/base/base_read_engine.hh | 1 - src/accl/graph/sega/coalesce_engine.cc | 22 +++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 6 +++--- src/accl/graph/sega/push_engine.cc | 12 ++++++------ src/accl/graph/sega/push_engine.hh | 6 +++--- src/accl/graph/sega/wl_engine.cc | 6 +++--- src/accl/graph/sega/wl_engine.hh | 2 +- 7 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 591b51aeb7..e21aaa01d2 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -29,7 +29,6 @@ #ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ #define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ -#include #include #include "base/addr_range.hh" diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 67874cb9b9..9fed1e8230 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -85,8 +85,8 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" , __func__, addr); // TODO: Make addrQueue and wlQueue into one std::pair - addrResponseQueue.push(addr); - worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + addrResponseQueue.push_back(addr); + worklistResponseQueue.push_back(cacheBlocks[block_index].items[wl_offset]); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -143,7 +143,7 @@ CoalesceEngine::recvReadAddr(Addr addr) MSHRMap[block_index].push_back(addr); // TODO: Parameterize 64 to memory atom size PacketPtr pkt = createReadPacket(aligned_addr, 64); - outstandingMemReqQueue.push(pkt); + outstandingMemReqQueue.push_back(pkt); 
stats.numVertexBlockReads++; @@ -175,7 +175,7 @@ CoalesceEngine::processNextMemReqEvent() if (!memPortBlocked()) { sendMemReq(pkt); - outstandingMemReqQueue.pop(); + outstandingMemReqQueue.pop_front(); } if ((!nextMemReqEvent.scheduled()) && @@ -192,8 +192,8 @@ CoalesceEngine::processNextRespondEvent() peerWLEngine->handleIncomingWL(addr_response, worklist_response); - addrResponseQueue.pop(); - worklistResponseQueue.pop(); + addrResponseQueue.pop_front(); + worklistResponseQueue.pop_front(); if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && @@ -234,8 +234,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (alligned_miss_addr == addr) { int wl_offset = (miss_addr - alligned_miss_addr) / 16; - addrResponseQueue.push(miss_addr); - worklistResponseQueue.push( + addrResponseQueue.push_back(miss_addr); + worklistResponseQueue.push_back( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; @@ -357,8 +357,8 @@ CoalesceEngine::processNextApplyAndCommitEvent() // TODO: Make sure this trick works; Addr alligned_miss_addr = (miss_addr / 64) * 64; PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); - outstandingMemReqQueue.push(write_pkt); - outstandingMemReqQueue.push(read_pkt); + outstandingMemReqQueue.push_back(write_pkt); + outstandingMemReqQueue.push_back(read_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -381,7 +381,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, evictQueue.size()); } else if ((!cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { - outstandingMemReqQueue.push(write_pkt); + outstandingMemReqQueue.push_back(write_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); diff --git 
a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4bb21676d4..2cb9856f76 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -74,10 +74,10 @@ class CoalesceEngine : public BaseReadEngine std::unordered_map> MSHRMap; int outstandingMemReqQueueSize; - std::queue outstandingMemReqQueue; + std::deque outstandingMemReqQueue; - std::queue addrResponseQueue; - std::queue worklistResponseQueue; + std::deque addrResponseQueue; + std::deque worklistResponseQueue; std::deque evictQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c305a4bbb9..450ba9ddc4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -109,7 +109,7 @@ PushEngine::recvWLItem(WorkListItem wl) return false; } - pushReqQueue.push(wl); + pushReqQueue.push_back(wl); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { @@ -153,10 +153,10 @@ PushEngine::processNextAddrGenEvent() reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; - pendingReadReqs.push(pkt); + pendingReadReqs.push_back(pkt); } - pushReqQueue.pop(); + pushReqQueue.pop_front(); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); @@ -175,7 +175,7 @@ PushEngine::processNextReadEvent() PacketPtr pkt = pendingReadReqs.front(); sendMemReq(pkt); onTheFlyReadReqs++; - pendingReadReqs.pop(); + pendingReadReqs.pop_front(); } if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { @@ -187,7 +187,7 @@ bool PushEngine::handleMemResp(PacketPtr pkt) { onTheFlyReadReqs--; - memRespQueue.push(pkt); + memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); @@ -224,7 +224,7 @@ PushEngine::processNextPushEvent() sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); if 
(sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop(); + memRespQueue.pop_front(); // TODO: Erase map entries here. } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 81acc9862b..1b1a812d16 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -64,18 +64,18 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::queue pushReqQueue; + std::deque pushReqQueue; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; // TODO: Possibility of infinite queueing - std::queue pendingReadReqs; + std::deque pendingReadReqs; int memRespQueueSize; int onTheFlyReadReqs; - std::queue memRespQueue; + std::deque memRespQueue; virtual void startup(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b874ec65ec..73eacf945f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -115,7 +115,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr] = *update_value; DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); - updateQueue.pop(); + updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); } } else { @@ -127,7 +127,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; - updateQueue.pop(); + updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions } @@ -188,7 +188,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.push(pkt); + updateQueue.push_back(pkt); assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { diff --git 
a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index ef18956ec1..c1ef028f77 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -67,7 +67,7 @@ class WLEngine : public BaseReduceEngine CoalesceEngine* coalesceEngine; int updateQueueSize; - std::queue updateQueue; + std::deque updateQueue; int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; From 2bfc6c7d5f6c2cb4911a2b72a228be95312a8dad Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 20:25:31 -0700 Subject: [PATCH 078/247] Removing old files and renaming utils to data_structs. --- src/accl/graph/base/base_reduce_engine.hh | 5 - .../graph/base/{util.hh => data_structs.hh} | 0 src/accl/graph/base/old/BaseApplyEngine.py | 36 ---- src/accl/graph/base/old/BaseEngine.py | 39 ----- src/accl/graph/base/old/BasePushEngine.py | 36 ---- src/accl/graph/base/old/BaseWLEngine.py | 36 ---- src/accl/graph/base/old/base_apply_engine.cc | 137 --------------- src/accl/graph/base/old/base_apply_engine.hh | 72 -------- src/accl/graph/base/old/base_engine.cc | 100 ----------- src/accl/graph/base/old/base_engine.hh | 98 ----------- src/accl/graph/base/old/base_push_engine.cc | 145 ---------------- src/accl/graph/base/old/base_push_engine.hh | 82 --------- src/accl/graph/base/old/base_wl_engine.cc | 134 --------------- src/accl/graph/base/old/base_wl_engine.hh | 83 ---------- src/accl/graph/sega/coalesce_engine.hh | 2 +- src/accl/graph/sega/old/ApplyEngine.py | 38 ----- src/accl/graph/sega/old/LockDir.py | 46 ------ src/accl/graph/sega/old/PushEngine.py | 37 ----- src/accl/graph/sega/old/WLEngine.py | 40 ----- src/accl/graph/sega/old/apply_engine.cc | 58 ------- src/accl/graph/sega/old/apply_engine.hh | 67 -------- src/accl/graph/sega/old/lock_dir.cc | 63 ------- src/accl/graph/sega/old/lock_dir.hh | 57 ------- src/accl/graph/sega/old/push_engine.cc | 90 ---------- src/accl/graph/sega/old/push_engine.hh | 77 --------- 
src/accl/graph/sega/old/wl_engine.cc | 156 ------------------ src/accl/graph/sega/old/wl_engine.hh | 86 ---------- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.hh | 3 +- 29 files changed, 4 insertions(+), 1821 deletions(-) rename src/accl/graph/base/{util.hh => data_structs.hh} (100%) delete mode 100644 src/accl/graph/base/old/BaseApplyEngine.py delete mode 100644 src/accl/graph/base/old/BaseEngine.py delete mode 100644 src/accl/graph/base/old/BasePushEngine.py delete mode 100644 src/accl/graph/base/old/BaseWLEngine.py delete mode 100644 src/accl/graph/base/old/base_apply_engine.cc delete mode 100644 src/accl/graph/base/old/base_apply_engine.hh delete mode 100644 src/accl/graph/base/old/base_engine.cc delete mode 100644 src/accl/graph/base/old/base_engine.hh delete mode 100644 src/accl/graph/base/old/base_push_engine.cc delete mode 100644 src/accl/graph/base/old/base_push_engine.hh delete mode 100644 src/accl/graph/base/old/base_wl_engine.cc delete mode 100644 src/accl/graph/base/old/base_wl_engine.hh delete mode 100644 src/accl/graph/sega/old/ApplyEngine.py delete mode 100644 src/accl/graph/sega/old/LockDir.py delete mode 100644 src/accl/graph/sega/old/PushEngine.py delete mode 100644 src/accl/graph/sega/old/WLEngine.py delete mode 100644 src/accl/graph/sega/old/apply_engine.cc delete mode 100644 src/accl/graph/sega/old/apply_engine.hh delete mode 100644 src/accl/graph/sega/old/lock_dir.cc delete mode 100644 src/accl/graph/sega/old/lock_dir.hh delete mode 100644 src/accl/graph/sega/old/push_engine.cc delete mode 100644 src/accl/graph/sega/old/push_engine.hh delete mode 100644 src/accl/graph/sega/old/wl_engine.cc delete mode 100644 src/accl/graph/sega/old/wl_engine.hh diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index f2245f571f..c8c9784ed1 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -29,8 +29,6 @@ #ifndef 
__ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ #define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ - -#include "accl/graph/base/util.hh" #include "params/BaseReduceEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -43,7 +41,6 @@ class BaseReduceEngine : public ClockedObject private: System* system; - protected: const RequestorID _requestorId; @@ -55,8 +52,6 @@ class BaseReduceEngine : public ClockedObject ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } - - virtual void handleIncomingWL(Addr addr, WorkListItem wl) = 0; }; } diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/data_structs.hh similarity index 100% rename from src/accl/graph/base/util.hh rename to src/accl/graph/base/data_structs.hh diff --git a/src/accl/graph/base/old/BaseApplyEngine.py b/src/accl/graph/base/old/BaseApplyEngine.py deleted file mode 100644 index 9b240581ac..0000000000 --- a/src/accl/graph/base/old/BaseApplyEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BaseApplyEngine(BaseEngine): - abstract = True - type = 'BaseApplyEngine' - cxx_header = 'accl/graph/base/base_apply_engine.hh' - cxx_class = 'gem5::BaseApplyEngine' diff --git a/src/accl/graph/base/old/BaseEngine.py b/src/accl/graph/base/old/BaseEngine.py deleted file mode 100644 index 16c2f402e5..0000000000 --- a/src/accl/graph/base/old/BaseEngine.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject - -class BaseEngine(ClockedObject): - abstract = True - type = 'BaseEngine' - cxx_header = "accl/graph/base/base_engine.hh" - cxx_class = 'gem5::BaseEngine' - - system = Param.System(Parent.any, 'System this Engine is a part of') - mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/old/BasePushEngine.py b/src/accl/graph/base/old/BasePushEngine.py deleted file mode 100644 index 2163864be3..0000000000 --- a/src/accl/graph/base/old/BasePushEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BasePushEngine(BaseEngine): - abstract = True - type = 'BasePushEngine' - cxx_header = "accl/graph/base/base_push_engine.hh" - cxx_class = 'gem5::BasePushEngine' diff --git a/src/accl/graph/base/old/BaseWLEngine.py b/src/accl/graph/base/old/BaseWLEngine.py deleted file mode 100644 index 7311c396b3..0000000000 --- a/src/accl/graph/base/old/BaseWLEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BaseWLEngine(BaseEngine): - abstract = True - type = 'BaseWLEngine' - cxx_header = "accl/graph/base/base_wl_engine.hh" - cxx_class = 'gem5::BaseWLEngine' diff --git a/src/accl/graph/base/old/base_apply_engine.cc b/src/accl/graph/base/old/base_apply_engine.cc deleted file mode 100644 index 39f5dafc67..0000000000 --- a/src/accl/graph/base/old/base_apply_engine.cc +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_apply_engine.hh" - -#include - -#include "accl/graph/base/util.hh" -#include "debug/MPU.hh" - - -namespace gem5 -{ - -BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): - BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()) -{} - -bool -BaseApplyEngine::recvWLNotif(Addr addr) -{ - // TODO: Investigate the situation where the queue is full. - applyReadQueue.push(addr); - if (!nextApplyCheckEvent.scheduled()){ - schedule(nextApplyCheckEvent, nextCycle()); - } - return true; -} - -void -BaseApplyEngine::processNextApplyCheckEvent() -{ - // TODO: We might want to change the way this function - // pops items off queue, maybe we should pop every n cycles - // or change the clock domain for this simobject. - Addr addr = applyReadQueue.front(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); - if (acquireAddress(req_addr)) { - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffset[memPkt->req] = req_offset; - if (!memPortBlocked()) { - sendMemReq(memPkt); - applyReadQueue.pop(); - } - } - if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ - schedule(nextApplyCheckEvent, nextCycle()); - } -} - -void -BaseApplyEngine::processNextApplyEvent() -{ - PacketPtr pkt = memRespQueue.front(); - uint8_t* data = pkt->getPtr(); - - RequestPtr request = pkt->req; - Addr request_offset = requestOffset[request]; - - WorkListItem wl = memoryToWorkList(data + request_offset); - DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item[%lu]: %s\n" - , __func__, pkt->getAddr() + request_offset, wl.to_string()); - // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem - // to applyengine if temp_prop < prop. If temp_prop has not changed, why - // fwd it to applyengine? - if (wl.temp_prop < wl.prop) { - // TODO: instead of min add a Reduce function. 
- //update prop with temp_prop - wl.prop = wl.temp_prop; - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - - DPRINTF(MPU, "%s: Sending a pkt with this info. " - "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", - __func__, writePkt->getAddr(), - writePkt->getSize(), writePkt->printData()); - - if (!memPortBlocked()) { - if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { - sendMemReq(writePkt); - memRespQueue.pop(); - DPRINTF(MPU, "%s: The Apply Engine is applying the new value into WorkList Item[%lu]: %s\n" - , __func__, pkt->getAddr() + request_offset, wl.to_string()); - } - } - } else { - memRespQueue.pop(); - } - if (!releaseAddress(pkt->getAddr())) { - panic("Could not release an address"); - } - if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextApplyEvent, nextCycle()); - } -} - -void -BaseApplyEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextApplyEvent.scheduled()) { - schedule(nextApplyEvent, nextCycle()); - } -} - -} diff --git a/src/accl/graph/base/old/base_apply_engine.hh b/src/accl/graph/base/old/base_apply_engine.hh deleted file mode 100644 index f4df298079..0000000000 --- a/src/accl/graph/base/old/base_apply_engine.hh +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_engine.hh" -#include "mem/request.hh" -#include "params/BaseApplyEngine.hh" - -namespace gem5 -{ - -class BaseApplyEngine : public BaseEngine -{ - private: - std::queue applyReadQueue; - - std::unordered_map requestOffset; - - EventFunctionWrapper nextApplyCheckEvent; - void processNextApplyCheckEvent(); - - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - - protected: - virtual bool sendApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edgeIndex) = 0; - virtual bool acquireAddress(Addr addr) = 0; - virtual bool releaseAddress(Addr addr) = 0; - virtual void scheduleMainEvent() override; - - public: - PARAMS(BaseApplyEngine); - - BaseApplyEngine(const BaseApplyEngineParams &apply); - - bool recvWLNotif(Addr addr); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_engine.cc b/src/accl/graph/base/old/base_engine.cc deleted file mode 100644 index ad87bb3662..0000000000 --- a/src/accl/graph/base/old/base_engine.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/base/base_engine.hh" -#include "debug/MPU.hh" -namespace gem5 -{ - -BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : - ClockedObject(params), - system(params.system), - memPort(name() + ".memPort", this), - requestorId(system->getRequestorId(this)) -{ - DPRINTF(MPU, "%s: My requestorId is %u,\n", __func__, requestorId); -} - -BaseEngine::~BaseEngine() -{} - -Port& -BaseEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "mem_port") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -BaseEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - return owner->handleMemResp(pkt); - -} - -void -BaseEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -bool -BaseEngine::handleMemResp(PacketPtr pkt) -{ - if (pkt->isResponse() && pkt->isWrite()) { - return true; - } - memRespQueue.push(pkt); - scheduleMainEvent(); - return true; -} - -} diff --git a/src/accl/graph/base/old/base_engine.hh b/src/accl/graph/base/old/base_engine.hh deleted file mode 100644 index 53415ddc7c..0000000000 --- a/src/accl/graph/base/old/base_engine.hh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ - -#include -#include - -#include "base/addr_range.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/BaseEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" - -namespace gem5 -{ - -class BaseEngine : public ClockedObject -{ - private: - class MemPort : public RequestPort - { - private: - BaseEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - System* system; - MemPort memPort; - - bool handleMemResp(PacketPtr resp); - - protected: - const RequestorID requestorId; - // TODO: Add this later, maybe? 
- // int memRespQueueSize; - std::queue memRespQueue; - - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } - - virtual void scheduleMainEvent() = 0; - - public: - PARAMS(BaseEngine); - - BaseEngine(const BaseEngineParams ¶ms); - ~BaseEngine(); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_push_engine.cc b/src/accl/graph/base/old/base_push_engine.cc deleted file mode 100644 index 4ebe40e486..0000000000 --- a/src/accl/graph/base/old/base_push_engine.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/base/base_push_engine.hh" - -#include "accl/graph/base/util.hh" -#include "debug/MPU.hh" - -namespace gem5 -{ - -BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : - BaseEngine(params), - nextReadEvent([this] { processNextReadEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()) -{} - -bool -BasePushEngine::recvApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edge_index) -{ - notifQueue.emplace(prop, degree, edge_index); - if (!nextReadEvent.scheduled()) { - schedule(nextReadEvent, nextCycle()); - } - DPRINTF(MPU, "%s: Reading %d edges.", __func__, degree); - return true; -} - -void -BasePushEngine::processNextReadEvent() -{ - ApplyNotif notif = notifQueue.front(); - - std::vector addr_queue; - std::vector offset_queue; - std::vector num_edge_queue; - - for (uint32_t index = 0; index < notif.degree; index++) { - // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 + (notif.edgeIndex + index) * sizeof(Edge); - Addr req_addr = (edge_addr / 64) * 64; - Addr req_offset = edge_addr % 64; - if (addr_queue.size()) { - if (addr_queue.back() == req_addr) { - num_edge_queue.back()++; - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - }; - - for (int index = 
0; index < addr_queue.size(); index++) { - if (!memPortBlocked()) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = notif.prop; - sendMemReq(pkt); - notifQueue.pop(); - } - } - - if (!nextReadEvent.scheduled() && !notifQueue.empty()) { - schedule(nextReadEvent, nextCycle()); - } -} - -void -BasePushEngine::processNextPushEvent() -{ - PacketPtr pkt = memRespQueue.front(); - RequestPtr req = pkt->req; - uint8_t *data = pkt->getPtr(); - - Addr offset = reqOffsetMap[req]; - int num_edges = reqNumEdgeMap[req]; - uint32_t value = reqValueMap[req]; - - int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); - for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); - Edge e = memoryToEdge(curr_edge_data); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here - *update_data = value + 1; - PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - requestorId); - if (sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop(); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); - // TODO: Erase map entries here. 
- } - } - - if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextPushEvent, nextCycle()); - } -} - -void -BasePushEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextPushEvent.scheduled()) { - schedule(nextPushEvent, nextCycle()); - } -} - -} diff --git a/src/accl/graph/base/old/base_push_engine.hh b/src/accl/graph/base/old/base_push_engine.hh deleted file mode 100644 index 01027d2791..0000000000 --- a/src/accl/graph/base/old/base_push_engine.hh +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ - -#include - -#include "accl/graph/base/base_engine.hh" -#include "mem/request.hh" -#include "params/BasePushEngine.hh" - -namespace gem5 -{ - -class BasePushEngine : public BaseEngine -{ - private: - struct ApplyNotif { - uint32_t prop; - uint32_t degree; - uint32_t edgeIndex; - - ApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index): - prop(prop), degree(degree), edgeIndex(edge_index) - {} - }; - - std::queue notifQueue; - // int notifQueueSize; - - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; - - EventFunctionWrapper nextReadEvent; - void processNextReadEvent(); - - EventFunctionWrapper nextPushEvent; - void processNextPushEvent(); - - protected: - virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void scheduleMainEvent() override; - - public: - - PARAMS(BasePushEngine); - - BasePushEngine(const BasePushEngineParams ¶ms); - - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_wl_engine.cc b/src/accl/graph/base/old/base_wl_engine.cc deleted file mode 100644 index fd45b85077..0000000000 --- a/src/accl/graph/base/old/base_wl_engine.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of 
California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_wl_engine.hh" -#include "debug/MPU.hh" - -#include - -namespace gem5 -{ - -BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): - BaseEngine(params), - nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), - nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) -{} - -bool -BaseWLEngine::handleWLUpdate(PacketPtr pkt) -{ - updateQueue.push(pkt); - if(!nextWLReadEvent.scheduled()) { - schedule(nextWLReadEvent, nextCycle()); - } - return true; -} - -void BaseWLEngine::processNextWLReadEvent() -{ - PacketPtr pkt = updateQueue.front(); - uint32_t value = *(pkt->getPtr()); - - Addr addr = pkt->getAddr(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = addr % 64; - - if (acquireAddress(req_addr)) { - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[memPkt->req] = req_offset; - requestValueMap[memPkt->req] = value; - - if (!memPortBlocked()) { - sendMemReq(memPkt); - updateQueue.pop(); - } - else{ - releaseAddress(req_addr); - } - } - if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { - schedule(nextWLReadEvent, nextCycle()); - } -} - -void -BaseWLEngine::processNextWLReduceEvent() -{ - PacketPtr resp = memRespQueue.front(); - uint8_t* respData = resp->getPtr(); - Addr request_offset = requestOffsetMap[resp->req]; - uint32_t value = requestValueMap[resp->req]; - WorkListItem wl = memoryToWorkList(respData + request_offset); - - DPRINTF(MPU, "%s: The WLE is reading WorkList item [%lu]: %s %d\n" - , __func__, resp->getAddr() + request_offset, wl.to_string(), value); - if (value < wl.temp_prop){ - //update prop with temp_prop - wl.temp_prop = value; - - uint8_t* wlData = workListToMemory(wl); - memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); - PacketPtr writePkt = - getWritePacket(resp->getAddr(), 64, respData, requestorId); - - DPRINTF(MPU, "%s: Sending a pkt with this info. 
" - "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", - __func__, writePkt->getAddr(), - writePkt->getSize(), writePkt->printData()); - if (!memPortBlocked()) { - if (sendWLNotif(resp->getAddr() + request_offset)) { - sendMemReq(writePkt); - memRespQueue.pop(); - DPRINTF(MPU, "%s: The WLE is changing to: %s\n" - , __func__, wl.to_string()); - // TODO: Erase map entries, delete wlData; - } - } - } - else { - memRespQueue.pop(); - } - if (!releaseAddress(resp->getAddr())) { - panic("Could not release an address"); - } - if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextWLReduceEvent, nextCycle()); - } -} - -void -BaseWLEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextWLReduceEvent.scheduled()) { - schedule(nextWLReduceEvent, nextCycle()); - } -} - - -} diff --git a/src/accl/graph/base/old/base_wl_engine.hh b/src/accl/graph/base/old/base_wl_engine.hh deleted file mode 100644 index 15371f965b..0000000000 --- a/src/accl/graph/base/old/base_wl_engine.hh +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_engine.hh" -#include "accl/graph/base/util.hh" -#include "params/BaseWLEngine.hh" - -namespace gem5 -{ - -class BaseWLEngine : public BaseEngine -{ - private: - std::queue updateQueue; - std::queue responseQueue; - - std::unordered_map requestOffsetMap; - std::unordered_map requestValueMap; - - //Events - EventFunctionWrapper nextWLReadEvent; - void processNextWLReadEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + MPU::MPU::MemPortsendTimingReq - */ - - EventFunctionWrapper nextWLReduceEvent; - void processNextWLReduceEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - protected: - virtual bool sendWLNotif(Addr addr) = 0; - virtual bool acquireAddress(Addr addr) = 0; - virtual bool releaseAddress(Addr addr) = 0; - virtual void scheduleMainEvent() override; - - public: - - PARAMS(BaseWLEngine); - - BaseWLEngine(const BaseWLEngineParams 
¶ms); - - bool handleWLUpdate(PacketPtr pkt); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2cb9856f76..ff30efde4c 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -30,7 +30,7 @@ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" -#include "accl/graph/base/util.hh" +#include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" diff --git a/src/accl/graph/sega/old/ApplyEngine.py b/src/accl/graph/sega/old/ApplyEngine.py deleted file mode 100644 index 7a446bb620..0000000000 --- a/src/accl/graph/sega/old/ApplyEngine.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseApplyEngine import BaseApplyEngine - -class ApplyEngine(BaseApplyEngine): - type = 'ApplyEngine' - cxx_header = "accl/graph/sega/apply_engine.hh" - cxx_class = 'gem5::ApplyEngine' - - push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") - lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/old/LockDir.py b/src/accl/graph/sega/old/LockDir.py deleted file mode 100644 index d21963dc3a..0000000000 --- a/src/accl/graph/sega/old/LockDir.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2012-2014, 2017-2018 ARM Limited -# All rights reserved. -# -# The license below extends only to copyright in the software and shall -# not be construed as granting a license to any other intellectual -# property including but not limited to intellectual property relating -# to a hardware implementation of the functionality of the software -# licensed hereunder. You may use the software subject to the license -# terms below provided that you ensure that this notice is replicated -# unmodified and in its entirety in all distributions of the software, -# modified or unmodified, in source code or in binary form. -# -# Copyright (c) 2007 The Regents of The University of Michigan -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.SimObject import SimObject - -class LockDirectory(SimObject): - type = 'LockDirectory' - cxx_header = 'accl/graph/sega/lock_dir.hh' - cxx_class = 'gem5::LockDirectory' diff --git a/src/accl/graph/sega/old/PushEngine.py b/src/accl/graph/sega/old/PushEngine.py deleted file mode 100644 index a743b57262..0000000000 --- a/src/accl/graph/sega/old/PushEngine.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BasePushEngine import BasePushEngine - -class PushEngine(BasePushEngine): - type = 'PushEngine' - cxx_header = "accl/graph/sega/push_engine.hh" - cxx_class = 'gem5::PushEngine' - - req_port = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/old/WLEngine.py b/src/accl/graph/sega/old/WLEngine.py deleted file mode 100644 index b6e697266e..0000000000 --- a/src/accl/graph/sega/old/WLEngine.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseWLEngine import BaseWLEngine - -class WLEngine(BaseWLEngine): - type = 'WLEngine' - cxx_header = "accl/graph/sega/wl_engine.hh" - cxx_class = 'gem5::WLEngine' - - resp_port = ResponsePort("Port to Receive updates from outside") - apply_engine = Param.ApplyEngine(Parent.any, - "MPU object that owns this WLEngine") - lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/old/apply_engine.cc b/src/accl/graph/sega/old/apply_engine.cc deleted file mode 100644 index 544bb082ad..0000000000 --- a/src/accl/graph/sega/old/apply_engine.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/apply_engine.hh" - -namespace gem5{ - -ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : - BaseApplyEngine(params), - pushEngine(params.push_engine), - lockDir(params.lock_dir) -{} - -bool -ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) -{ - return pushEngine->recvApplyNotif(prop, degree, edgeIndex); - -} - -bool -ApplyEngine::acquireAddress(Addr addr) -{ - return lockDir->acquire(addr, requestorId); -} - -bool -ApplyEngine::releaseAddress(Addr addr) -{ - return lockDir->release(addr, requestorId); -} - -} diff --git a/src/accl/graph/sega/old/apply_engine.hh b/src/accl/graph/sega/old/apply_engine.hh deleted file mode 100644 index c88330487a..0000000000 --- a/src/accl/graph/sega/old/apply_engine.hh +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" -#include "accl/graph/sega/push_engine.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/ApplyEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/port.hh" - -namespace gem5 -{ - - -class ApplyEngine : public BaseApplyEngine -{ - private: - PushEngine* pushEngine; - LockDirectory* lockDir; - - protected: - virtual bool sendApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edgeIndex) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; - - public: - PARAMS(ApplyEngine); - ApplyEngine(const ApplyEngineParams ¶ms); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/lock_dir.cc b/src/accl/graph/sega/old/lock_dir.cc deleted file mode 100644 index 6a4496175d..0000000000 --- a/src/accl/graph/sega/old/lock_dir.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/lock_dir.hh" - -namespace gem5 -{ - -LockDirectory::LockDirectory(const LockDirectoryParams ¶ms) : - SimObject(params) -{} - -bool -LockDirectory::acquire(Addr addr, RequestorID requestorId) -{ - if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { - lockOwnerMap[addr] = requestorId; - return true; - } else { - return false; - } -} - -bool -LockDirectory::release(Addr addr, RequestorID requestorId) -{ - if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { - panic("Should not relase an address before acquiring"); - } else if (lockOwnerMap[addr] != requestorId) { - panic("Should not release and address you don't own"); - } else { - lockOwnerMap.erase(addr); - return true; - } - return false; -} - -} diff --git a/src/accl/graph/sega/old/lock_dir.hh b/src/accl/graph/sega/old/lock_dir.hh deleted file mode 100644 index 012334ce43..0000000000 --- a/src/accl/graph/sega/old/lock_dir.hh +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ -#define __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ - -#include - -#include "mem/packet.hh" -#include "params/LockDirectory.hh" -#include "sim/sim_object.hh" - -namespace gem5 -{ - -class LockDirectory: public SimObject -{ - private: - std::unordered_map lockOwnerMap; - // std::unordered_map lockDegreeMap; - - public: - PARAMS(LockDirectory); - LockDirectory(const LockDirectoryParams ¶ms); - - bool acquire(Addr addr, RequestorID requestorId); - bool release(Addr addr, RequestorID requestorId); -}; - -} - -#endif diff --git a/src/accl/graph/sega/old/push_engine.cc b/src/accl/graph/sega/old/push_engine.cc deleted file mode 100644 index c7b229ad33..0000000000 --- a/src/accl/graph/sega/old/push_engine.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/push_engine.hh" - -namespace gem5 -{ - -PushEngine::PushEngine(const PushEngineParams ¶ms) : - BasePushEngine(params), - reqPort(name() + "reqPort", this) -{} - -Port& -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return BasePushEngine::getPort(if_name, idx); - } -} - -void -PushEngine::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -bool -PushEngine::sendPushUpdate(PacketPtr pkt) -{ - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return false; -} - -} diff --git a/src/accl/graph/sega/old/push_engine.hh b/src/accl/graph/sega/old/push_engine.hh deleted file mode 100644 index 604df4750d..0000000000 --- a/src/accl/graph/sega/old/push_engine.hh +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ - -#include "accl/graph/base/base_push_engine.hh" -#include "params/PushEngine.hh" - -namespace gem5 -{ - -class MPU; - -class PushEngine : public BasePushEngine -{ - private: - class ReqPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, PushEngine* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - ReqPort reqPort; - - protected: - virtual bool sendPushUpdate(PacketPtr pkt) override; - - public: - PARAMS(PushEngine); - PushEngine(const PushEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} - -#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/wl_engine.cc b/src/accl/graph/sega/old/wl_engine.cc deleted file mode 100644 index 03f74f1019..0000000000 --- a/src/accl/graph/sega/old/wl_engine.cc +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/wl_engine.hh" -#include "debug/MPU.hh" -namespace gem5 -{ - -WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params), - respPort(name() + ".respPort", this), - applyEngine(params.apply_engine), - lockDir(params.lock_dir) -{} - -Port& -WLEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "resp_port") { - return respPort; - } else { - return BaseWLEngine::getPort(if_name, idx); - } -} - -void -WLEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - WorkListItem vertices [5] = { - {10000, 10000, 3, 0}, // Addr: 0 - {10000, 10000, 1, 3}, // Addr: 16 - {10000, 10000, 1, 4}, // Addr: 32 - {10000, 10000, 1, 5}, // Addr: 48 - {10000, 10000, 0, 6} // Addr: 64 - }; - Edge edges [7] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64}, // Addr: 1048640 - {0, 32} - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - sendMemFunctional(pkt); - } - - for (int i = 0; i < 7; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - sendMemFunctional(pkt); - } - - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, requestorId); - - handleWLUpdate(first_update); -} - -bool -WLEngine::sendWLNotif(Addr addr){ - return applyEngine->recvWLNotif(addr); -} - -AddrRangeList -WLEngine::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -WLEngine::RespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleWLUpdate(pkt); -} - -Tick -WLEngine::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void 
-WLEngine::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - // FIXME: This needs to be fixed - // if (pkt->cmd == MemCmd::UpdateWL) { - // panic("Functional requests should not be made to WL."); - // //TODO: Might be a good idea to implement later. - // // wlEngine->recvFunctional(pkt); - // } else { - sendMemFunctional(pkt); - // } -} - -bool -WLEngine::acquireAddress(Addr addr) -{ - return lockDir->acquire(addr, requestorId); -} - -bool -WLEngine::releaseAddress(Addr addr) -{ - return lockDir->release(addr, requestorId); -} - -} diff --git a/src/accl/graph/sega/old/wl_engine.hh b/src/accl/graph/sega/old/wl_engine.hh deleted file mode 100644 index 4e8a25795a..0000000000 --- a/src/accl/graph/sega/old/wl_engine.hh +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" -#include "params/WLEngine.hh" - -namespace gem5 -{ - -class ApplyEngine; - -class WLEngine : public BaseWLEngine -{ - private: - class RespPort : public ResponsePort - { - private: - WLEngine* owner; - - public: - RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - RespPort respPort; - ApplyEngine* applyEngine; - LockDirectory* lockDir; - - virtual void startup(); - void recvFunctional(PacketPtr pkt); - - protected: - virtual bool sendWLNotif(Addr addr) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; - - public: - PARAMS(WLEngine); - WLEngine(const WLEngineParams ¶ms); - Port& getPort(const 
std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} -#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1b1a812d16..4c9822345f 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,7 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" -#include "accl/graph/base/util.hh" +#include "accl/graph/base/data_structs.hh" #include "params/PushEngine.hh" namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c1ef028f77..a8dff32d44 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" @@ -108,7 +109,7 @@ class WLEngine : public BaseReduceEngine bool handleIncomingUpdate(PacketPtr pkt); - virtual void handleIncomingWL(Addr addr, WorkListItem wl); + void handleIncomingWL(Addr addr, WorkListItem wl); }; } From 2d18a7b77fb6e0bdbb5d9fae5ef92ee9a3181311 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 1 Apr 2022 11:07:05 -0700 Subject: [PATCH 079/247] Fixing bugs. 
--- configs/accl/sega.py | 9 +-- src/accl/graph/sega/push_engine.cc | 110 +++++++++++++++-------------- src/accl/graph/sega/push_engine.hh | 6 +- src/accl/graph/sega/wl_engine.cc | 23 +++--- 4 files changed, 78 insertions(+), 70 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 680157ba7e..a0c7766fe0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,11 +5,12 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=0x80000000, - push_req_queue_size = 16) + push_req_queue_size=16, + mem_resp_queue_size=8) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size = 16, + update_queue_size=16, on_the_fly_update_map_size=8) self.interconnect = SystemXBar() @@ -60,9 +61,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="epinions/graph_binaries/vertices", + vertex_binary="facebook/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="epinions/graph_binaries/edgelist_0") + edge_binary="facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.getPort()) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 450ba9ddc4..0b4c981d48 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -65,6 +65,7 @@ PushEngine::startup() *tempPtr = 0; PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + // PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); sendPushUpdate(first_update); } @@ -109,7 +110,11 @@ PushEngine::recvWLItem(WorkListItem wl) return false; } - pushReqQueue.push_back(wl); + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * 
sizeof(Edge)); + uint32_t update_value = wl.prop; + pushReqQueue.push_back( + std::make_pair(std::make_pair(start_addr, end_addr), update_value)); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { @@ -121,43 +126,36 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::processNextAddrGenEvent() { - WorkListItem wl = pushReqQueue.front(); - - std::vector addr_queue; - std::vector offset_queue; - std::vector num_edge_queue; - - for (uint32_t index = 0; index < wl.degree; index++) { - Addr edge_addr = baseEdgeAddr + (wl.edgeIndex + index) * sizeof(Edge); - Addr req_addr = (edge_addr / 64) * 64; - Addr req_offset = edge_addr % 64; - if (addr_queue.size()) { - if (addr_queue.back() == req_addr) { - num_edge_queue.back()++; - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - }; - - for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = createReadPacket(addr_queue[index], 64); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = wl.prop; - pendingReadReqs.push_back(pkt); + Addr start_addr, end_addr; + uint32_t update_value; + + std::pair, uint32_t> front = pushReqQueue.front(); + std::tie(start_addr, end_addr) = front.first; + update_value = front.second; + + Addr req_addr = (start_addr / 64) * 64; + Addr req_offset = start_addr % 64; + int num_edges = 0; + + if (end_addr > req_addr + 64) { + num_edges = (req_addr + 64 - start_addr) / sizeof(Edge); + } else { + num_edges = (end_addr - start_addr) / sizeof(Edge); } + PacketPtr pkt = createReadPacket(req_addr, 64); + reqOffsetMap[pkt->req] = req_offset; + reqNumEdgeMap[pkt->req] = num_edges; + reqValueMap[pkt->req] = update_value; + pendingReadReqs.push_back(pkt); pushReqQueue.pop_front(); + if (req_addr + 64 < end_addr) { + 
pushReqQueue.push_front( + std::make_pair(std::make_pair(req_addr + 64, end_addr), update_value) + ); + } + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); } @@ -207,26 +205,30 @@ PushEngine::processNextPushEvent() __func__, pkt->getAddr()); Addr offset = reqOffsetMap[req]; - int num_edges = reqNumEdgeMap[req]; uint32_t value = reqValueMap[req]; - for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * sizeof(Edge)); - Edge* e = (Edge*) (curr_edge_data); - DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here - *update_data = value + 1; - DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, *update_data); - PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); - - if (sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop_front(); - // TODO: Erase map entries here. 
- } + Edge* e = (Edge*) (data + offset); + DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); + // TODO: Implement propagate function here + *update_data = value + 1; + // uint32_t update_value = value + 1; + DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", + __func__, e->neighbor, *update_data); + PacketPtr update = createUpdatePacket(e->neighbor, + sizeof(uint32_t), (uint8_t*) update_data); + + if (sendPushUpdate(update)) { + reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); + reqNumEdgeMap[req]--; + } + + if (reqNumEdgeMap[req] == 0) { + memRespQueue.pop_front(); + reqOffsetMap.erase(req); + reqNumEdgeMap.erase(req); + reqValueMap.erase(req); } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { @@ -235,7 +237,8 @@ PushEngine::processNextPushEvent() } PacketPtr -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) +// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -247,6 +250,7 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) pkt->allocate(); pkt->setData(data); + // pkt->setLE(value); return pkt; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4c9822345f..faee5128b7 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -64,8 +64,9 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::deque pushReqQueue; + std::deque, uint32_t>> pushReqQueue; + // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; @@ -79,7 
+80,8 @@ class PushEngine : public BaseReadEngine virtual void startup(); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); + // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); bool sendPushUpdate(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 73eacf945f..117abb61e8 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -106,17 +106,18 @@ WLEngine::processNextReadEvent() uint32_t* update_value = update->getPtr(); // FIXME: else logic is wrong - if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && - (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { - if (coalesceEngine->recvReadAddr(update_addr)) { - DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " - "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, *update_value); - onTheFlyUpdateMap[update_addr] = *update_value; - DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); - updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { + if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + if (coalesceEngine->recvReadAddr(update_addr)) { + DPRINTF(MPU, "%s: Received an update and it's not been pulled in. 
" + "update_addr: %lu, update_value: %u.\n", + __func__, update_addr, *update_value); + onTheFlyUpdateMap[update_addr] = *update_value; + DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); + updateQueue.pop_front(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + } } } else { // TODO: Generalize this to reduce function rather than just min From cf001ea9840f11ad2d78fa73c83cb5100039819a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 3 Apr 2022 15:39:56 -0700 Subject: [PATCH 080/247] Updating createUpdatePacket. --- src/accl/graph/TODO.md | 8 ++++++++ src/accl/graph/sega/coalesce_engine.cc | 17 ++++------------- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 26 +++++++++++++------------- src/accl/graph/sega/push_engine.hh | 4 ++-- src/accl/graph/sega/wl_engine.cc | 14 ++++++++------ 6 files changed, 36 insertions(+), 34 deletions(-) create mode 100644 src/accl/graph/TODO.md diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md new file mode 100644 index 0000000000..d5effbeb96 --- /dev/null +++ b/src/accl/graph/TODO.md @@ -0,0 +1,8 @@ +# TODO Items + +* use setLE/setBE inside createUpdatePacket and createWritePacket +* parameterize cache size, associativity, maybe latencies, +and memory atom size in the coalesce engine +* look at all the simobjects and come up with a general architecture. Make +sure all the simobjects follow that architecture. +* implement all the communications between simobjects as req/retry. 
diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 9fed1e8230..8d97fffd20 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -300,19 +301,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) - bool found = false; if ((cacheBlocks[block_index].takenMask == 0)) { - for (auto index : evictQueue) { - if (block_index == index) { - found = true; - break; - } - } - if (!found) { - evictQueue.push_back(block_index); - } - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); + evictQueue.push_back(block_index); } if ((!nextApplyAndCommitEvent.scheduled()) && @@ -328,6 +318,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() int block_index = evictQueue.front(); uint8_t changedMask = 0; // TODO: parameterize 64 to memory atom size + uint8_t* wl_data; uint8_t data[64]; for (int i = 0; i < 4; i++) { @@ -341,7 +332,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. 
" "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, i, cacheBlocks[block_index].items[i].to_string()); - uint8_t* wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); + wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); std::memcpy(data + (i * sizeof(WorkListItem)), wl_data, sizeof(WorkListItem)); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ff30efde4c..5c4e752cbf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -84,6 +84,7 @@ class CoalesceEngine : public BaseReadEngine virtual void startup(); PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0b4c981d48..870b32f2fb 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -64,8 +65,8 @@ PushEngine::startup() uint32_t* tempPtr = (uint32_t*) first_update_data; *tempPtr = 0; - PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); - // PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); + // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); sendPushUpdate(first_update); } @@ -193,7 +194,7 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -// FIXME: FIX THIS FUNCTION FOR TIMING AND FUNCTIONAL ACCURACY. +// TODO: Add a parameter to allow for doing multiple pushes at the same time. 
void PushEngine::processNextPushEvent() { @@ -209,15 +210,14 @@ PushEngine::processNextPushEvent() Edge* e = (Edge*) (data + offset); DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); + // TODO: Implement propagate function here - *update_data = value + 1; - // uint32_t update_value = value + 1; + uint32_t update_value = value + 1; DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, *update_data); + __func__, e->neighbor, update_value); + PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t), (uint8_t*) update_data); + sizeof(uint32_t), update_value); if (sendPushUpdate(update)) { reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); @@ -237,8 +237,8 @@ PushEngine::processNextPushEvent() } PacketPtr -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) -// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) +// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -249,8 +249,8 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->allocate(); - pkt->setData(data); - // pkt->setLE(value); + // pkt->setData(data); + pkt->setLE(value); return pkt; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index faee5128b7..a539079ede 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -80,8 +80,8 @@ class PushEngine : public BaseReadEngine virtual void startup(); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); - // PacketPtr 
createUpdatePacket(Addr addr, unsigned int size, uint32_t value); + // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); bool sendPushUpdate(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 117abb61e8..3a6911c1bf 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,9 @@ */ #include "accl/graph/sega/wl_engine.hh" + #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -103,7 +105,7 @@ WLEngine::processNextReadEvent() { PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); - uint32_t* update_value = update->getPtr(); + uint32_t update_value = update->getLE(); // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { @@ -111,8 +113,8 @@ WLEngine::processNextReadEvent() if (coalesceEngine->recvReadAddr(update_addr)) { DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, *update_value); - onTheFlyUpdateMap[update_addr] = *update_value; + __func__, update_addr, update_value); + onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); @@ -123,10 +125,10 @@ WLEngine::processNextReadEvent() // TODO: Generalize this to reduce function rather than just min DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." 
"update_addr: %lu, update_value: %u, old_value: %u.\n", - __func__, update_addr, *update_value, + __func__, update_addr, update_value, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = - std::min(*update_value, onTheFlyUpdateMap[update_addr]); + std::min(update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); @@ -154,7 +156,6 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - std::unordered_map::iterator it = addrWorkListMap.begin(); @@ -190,6 +191,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push_back(pkt); + assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { From c405e30aacbebd410d24fc83924f9769ea8e74f9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 3 Apr 2022 17:26:15 -0700 Subject: [PATCH 081/247] Adding retry to wle respPort and debug. --- src/accl/graph/sega/push_engine.cc | 13 +++++++++---- src/accl/graph/sega/wl_engine.cc | 31 +++++++++++++++++++++++++----- src/accl/graph/sega/wl_engine.hh | 3 +++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 870b32f2fb..70d6242f5b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -95,10 +95,12 @@ PushEngine::ReqPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + DPRINTF(MPU, "%s: Received a reqRetry.\n", __func__); + _blocked = false; sendPacket(blockedPacket); - if (!blocked()) { + if (!_blocked) { blockedPacket = nullptr; } } @@ -202,12 +204,13 @@ PushEngine::processNextPushEvent() RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); - DPRINTF(MPU, "%s: Looking at the front of the queue. 
pkt->Addr: %lu.\n", - __func__, pkt->getAddr()); - Addr offset = reqOffsetMap[req]; uint32_t value = reqValueMap[req]; + DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " + "offset: %lu\n", + __func__, pkt->getAddr(), offset); + Edge* e = (Edge*) (data + offset); DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); @@ -220,6 +223,8 @@ PushEngine::processNextPushEvent() sizeof(uint32_t), update_value); if (sendPushUpdate(update)) { + DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", + __func__, e->neighbor, update_value); reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); reqNumEdgeMap[req]--; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 3a6911c1bf..27c7ad4fea 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -64,10 +64,25 @@ WLEngine::RespPort::getAddrRanges() const return owner->getAddrRanges(); } +void +WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + DPRINTF(MPU, "%s: Sending a reqRetry.\n", __func__); + sendRetryReq(); + needSendRetryReq = false; + } +} + bool WLEngine::RespPort::recvTimingReq(PacketPtr pkt) { - return owner->handleIncomingUpdate(pkt); + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; } Tick @@ -107,7 +122,6 @@ WLEngine::processNextReadEvent() Addr update_addr = update->getAddr(); uint32_t update_value = update->getLE(); - // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { if (coalesceEngine->recvReadAddr(update_addr)) { @@ -118,7 +132,11 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: 0: updateQueue.size: %d.\n", __func__, 
updateQueue.size()); + if (updateQueue.size() == updateQueueSize - 1) { + respPort.checkRetryReq(); + } + } } } else { @@ -131,8 +149,10 @@ WLEngine::processNextReadEvent() std::min(update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); - // TODO: Add a stat to count the number of coalescions + DPRINTF(MPU, "%s: 1: updateQueue.size: %d.\n", __func__, updateQueue.size()); + if (updateQueue.size() == updateQueueSize - 1) { + respPort.checkRetryReq(); + } } // TODO: Only schedule nextReadEvent only when it has to be scheduled @@ -180,6 +200,7 @@ WLEngine::processNextReduceEvent() for (int i = 0; i < servicedAddresses.size(); i++) { onTheFlyUpdateMap.erase(servicedAddresses[i]); } + DPRINTF(MPU, "%s: onTheFlyUpdateMap.size(): %u, servicedAddresses.size(): %u.\n", __func__, onTheFlyUpdateMap.size(), servicedAddresses.size()); } bool diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index a8dff32d44..476c9be932 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -48,6 +48,7 @@ class WLEngine : public BaseReduceEngine { private: WLEngine* owner; + bool needSendRetryReq; public: RespPort(const std::string& name, WLEngine* owner): @@ -55,6 +56,8 @@ class WLEngine : public BaseReduceEngine {} virtual AddrRangeList getAddrRanges() const; + void checkRetryReq(); + protected: virtual bool recvTimingReq(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt); From f43564614cbf10d78bb23122e2242e657776ebef Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 5 Apr 2022 09:20:52 -0700 Subject: [PATCH 082/247] Debugging coalesce engine deadlock. 
--- src/accl/graph/base/data_structs.hh | 8 +- src/accl/graph/sega/coalesce_engine.cc | 247 ++++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 71 ++++--- 5 files changed, 254 insertions(+), 76 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index b51a9f0781..dacb74e38c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -70,10 +70,10 @@ struct __attribute__ ((packed)) Edge uint16_t weight : 16; uint64_t neighbor : 48; - std::string to_string() - { - return csprintf("Edge{weight: %lu, neighbor: %lu}", weight, neighbor); - } + // std::string to_string() + // { + // return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + // } Edge(uint16_t weight, uint64_t neighbor): weight(weight), diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8d97fffd20..d7fa806fff 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -41,6 +41,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + alarmRequested(false), + spaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), @@ -77,17 +79,21 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / 64) * 64; - int block_index = aligned_addr % 256; + int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); if ((cacheBlocks[block_index].addr == aligned_addr) && 
(cacheBlocks[block_index].valid)) { // Hit - DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" - , __func__, addr); // TODO: Make addrQueue and wlQueue into one std::pair addrResponseQueue.push_back(addr); - worklistResponseQueue.push_back(cacheBlocks[block_index].items[wl_offset]); + worklistResponseQueue.push_back( + cacheBlocks[block_index].items[wl_offset]); + DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " + "to worklistResponseQueue. worklistResponseQueue.size = %d.\n", + __func__, addr, block_index, wl_offset, + worklistResponseQueue.size(), + cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -101,50 +107,72 @@ CoalesceEngine::recvReadAddr(Addr addr) return true; } else { // miss + DPRINTF(MPU, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHRMap.find(block_index) == MSHRMap.end()) { + DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu not " + "found in MSHRs.\n", __func__, block_index, addr); assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries + DPRINTF(MPU, "%s: Out of MSHR entries. " + "Rejecting request.\n", __func__); return false; } else { + DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); + DPRINTF(MPU, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + "Rejecting request.\n", + __func__, block_index); return false; } - // MSHR available but conflict - DPRINTF(MPU, "%s: Read request with addr: %lu missed with " - "conflict. 
Making a request for " - "aligned_addr: %lu.\n", - __func__, addr, aligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d]", __func__, addr, block_index); return true; } else { // TODO: Set valid to false every deallocation and - // assert valid == false here. + assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict assert( outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " + "allocate a cache line for it.\n", + __func__, addr); if (outstandingMemReqQueue.size() == outstandingMemReqQueueSize) { + DPRINTF(MPU, "%s: No space in outstandingMemReqQueue " + "(outstandingMemReqQueue.size: %u). " + "Rejecting request.\n", __func__, + outstandingMemReqQueue.size()); return false; } - DPRINTF(MPU, "%s: Read request with addr: " - "%lu missed with no conflict. " - "Making a request for aligned_addr: %lu.\n" - , __func__, addr, aligned_addr); cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; + DPRINTF(MPU, "%s: Allocated cache line[%d] for " + "Addr: %lu.\n", __func__, block_index, addr); MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d].\n", __func__, addr, block_index); // TODO: Parameterize 64 to memory atom size PacketPtr pkt = createReadPacket(aligned_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, addr, aligned_addr); outstandingMemReqQueue.push_back(pkt); + DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue. 
" + "outstandingMemReqQueue.size = %d", __func__, + outstandingMemReqQueue.size()); stats.numVertexBlockReads++; @@ -156,14 +184,24 @@ CoalesceEngine::recvReadAddr(Addr addr) } } } else { + DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu already " + "in MSHRs.\n", __func__, block_index, addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + "Rejecting request.\n", + __func__, block_index); return false; } if ((!cacheBlocks[block_index].hasConflict) && (aligned_addr != cacheBlocks[block_index].addr)) { + DPRINTF(MPU, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; } MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d].\n", __func__, addr, block_index); return true; } } @@ -176,9 +214,24 @@ CoalesceEngine::processNextMemReqEvent() if (!memPortBlocked()) { sendMemReq(pkt); + DPRINTF(MPU, "%s: Sent a memory request to Addr: %lu, MemCmd: %s.\n", + __func__, pkt->getAddr(), pkt->isRead() ? "Read" : "Write"); outstandingMemReqQueue.pop_front(); + DPRINTF(MPU, "%s: Popped a packet from outstandingMemReqQueue. " + "outstandingMemReqQueue.size = %u.\n", __func__, + outstandingMemReqQueue.size()); } + if ((alarmRequested) && + (outstandingMemReqQueue.size() < + (outstandingMemReqQueueSize - spaceRequested))) { + alarmRequested = false; + spaceRequested = 0; + schedule(nextApplyAndCommitEvent, nextCycle()); + DPRINTF(MPU, "%s: There is an alarm request for " + "nextApplyAndCommitEvent. 
Reset alarm parameters and scheduled " + "nextApplyAndCommitEvent.\n", __func__); + } if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { schedule(nextMemReqEvent, nextCycle()); @@ -192,9 +245,14 @@ CoalesceEngine::processNextRespondEvent() WorkListItem worklist_response = worklistResponseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); + DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); addrResponseQueue.pop_front(); worklistResponseQueue.pop_front(); + DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " + "worklistResponseQueue.size = %d.\n", __func__, + worklistResponseQueue.size()); if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && @@ -208,15 +266,20 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); if (pkt->isWrite()) { + DPRINTF(MPU, "%s: Received a write response for Addr: %lu. Dropping " + "the packet.\n", __func__, pkt->getAddr()); return true; } Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - int block_index = addr % 256; // TODO: After parameterizing the cache size - // this 256 number should change to the cache - // size parameter. + // TODO: After parameterizing the cache size + // this 256 number should change to the cache + // size parameter. 
+ int block_index = (addr / 64) % 256; + DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", + __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR @@ -224,6 +287,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = *((WorkListItem*) ( data + (i * sizeof(WorkListItem)))); + DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, + block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; @@ -231,29 +296,42 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr alligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = (miss_addr / 64) * 64; - if (alligned_miss_addr == addr) { - int wl_offset = (miss_addr - alligned_miss_addr) / 16; + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / 16; + DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " + "be serviced with the received packet.\n", + __func__, miss_addr, block_index); addrResponseQueue.push_back(miss_addr); worklistResponseQueue.push_back( cacheBlocks[block_index].items[wl_offset]); + DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " + "worklistResponseQueue. 
worklistResponseQueue.size = %u.\n" + , __func__, block_index, wl_offset, + worklistResponseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; servicedIndices.push_back(i); + DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " + "removal.\n", __func__, i, block_index); } } // TODO: We Can use taken instead of this for (int i = 0; i < servicedIndices.size(); i++) { + Addr print_addr = MSHRMap[block_index][i - bias]; MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; + DPRINTF(MPU, "%s: Addr: %lu has been serviced and is removed.\n", + __func__, print_addr); } if (MSHRMap[block_index].empty()) { MSHRMap.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { + // TODO: I think this is unnecessary. cacheBlocks[block_index].hasConflict = true; } @@ -286,27 +364,33 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { Addr aligned_addr = (addr / 64) * 64; - int block_index = aligned_addr % 256; + int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / 16; - DPRINTF(MPU, "%s: Recieved a WorkList write. addr: %lu, wl: %s.\n", - __func__, addr, wl.to_string()); - DPRINTF(MPU, "%s: aligned_addr: %lu, block_index: %d, wl_offset: %d, " - "takenMask: %u.\n", __func__, aligned_addr, - block_index, wl_offset, cacheBlocks[block_index].takenMask); + + DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", + __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; + DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, + cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
// && (cacheBlocks[block_index].hasConflict) if ((cacheBlocks[block_index].takenMask == 0)) { + DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." + " It does not have any taken items anymore.\n", + __func__, block_index); evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); } if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())) { + (!evictQueue.empty())&& + ((!alarmRequested) && (spaceRequested == 0))) { schedule(nextApplyAndCommitEvent, nextCycle()); } @@ -315,90 +399,163 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyAndCommitEvent() { + assert((!alarmRequested) && (spaceRequested == 0)); int block_index = evictQueue.front(); uint8_t changedMask = 0; // TODO: parameterize 64 to memory atom size uint8_t* wl_data; uint8_t data[64]; + DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", + __func__, block_index); + DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " + "then commited.\n", __func__, block_index); + + if ((cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + + } else if ((!cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else { + alarmRequested = true; + spaceRequested = cacheBlocks[block_index].hasConflict ? 2 : 1; + DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. 
Set " + "an alarm for nextApplyAndCommitEvent when space = %d.\n", + __func__, spaceRequested); + return; + } + for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); + DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, + block_index, i, + cacheBlocks[block_index].items[i].to_string()); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); + DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", + __func__, block_index, i); } - DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. " - "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, - i, cacheBlocks[block_index].items[i].to_string()); wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); std::memcpy(data + (i * sizeof(WorkListItem)), wl_data, sizeof(WorkListItem)); } if (changedMask) { + DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" + , __func__, block_index); assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = createWritePacket( cacheBlocks[block_index].addr, 64, data); - - if ((cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", + __func__, write_pkt->getAddr()); + if (cacheBlocks[block_index].hasConflict) { + assert( + outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1 + ); + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); Addr miss_addr = MSHRMap[block_index][0]; - // TODO: Make sure this trick works; - Addr alligned_miss_addr = (miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = (miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); outstandingMemReqQueue.push_back(write_pkt); outstandingMemReqQueue.push_back(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue. 
" + "outstandingMemReqQueue.size = %u.\n", __func__, + outstandingMemReqQueue.size()); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); } if ((changedMask & (2)) == 2) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); } if ((changedMask & (4)) == 4) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); } if ((changedMask & (8)) == 8) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); } cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; evictQueue.pop_front(); - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); - } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + } else { + assert(outstandingMemReqQueue.size() < outstandingMemReqQueueSize); + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); outstandingMemReqQueue.push_back(write_pkt); + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue. 
oustandingMemReqQueue.size = " + "%u.\n", __func__, outstandingMemReqQueue.size()); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); } if ((changedMask & (2)) == 2) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); } if ((changedMask & (4)) == 4) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); } if ((changedMask & (8)) == 8) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); } cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; evictQueue.pop_front(); - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); - } else { - DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , - __func__); + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } } else { + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " + "backs are necessary. Deallocated cache line[%d].\n", + __func__, block_index, block_index); evictQueue.pop_front(); + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } if ((!nextMemReqEvent.scheduled()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 5c4e752cbf..902a960301 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -74,6 +74,8 @@ class CoalesceEngine : public BaseReadEngine std::unordered_map> MSHRMap; int outstandingMemReqQueueSize; + bool alarmRequested; + int spaceRequested; std::deque outstandingMemReqQueue; std::deque addrResponseQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70d6242f5b..c9ed781d79 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -212,7 +212,7 @@ PushEngine::processNextPushEvent() __func__, pkt->getAddr(), offset); Edge* e = (Edge*) (data + offset); - DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + // DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); // TODO: Implement propagate function here uint32_t update_value = value + 1; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 27c7ad4fea..ea45cae652 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -68,7 +68,7 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - DPRINTF(MPU, "%s: Sending a reqRetry.\n", __func__); + DPRINTF(MPU, "%s: Sending a RetryReq.\n", __func__); sendRetryReq(); needSendRetryReq = false; } @@ -121,43 +121,49 @@ WLEngine::processNextReadEvent() PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); uint32_t update_value = update->getLE(); + DPRINTF(MPU, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " + "value: %u.\n", __func__, update_addr, update_value); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { + DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", + __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { if (coalesceEngine->recvReadAddr(update_addr)) { - DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " - "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, update_value); onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " + "onTheFlyUpdateMap[%lu] = %u.\n", __func__, + update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: 0: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); if (updateQueue.size() == updateQueueSize - 1) { respPort.checkRetryReq(); } - } } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." - "update_addr: %lu, update_value: %u, old_value: %u.\n", - __func__, update_addr, update_value, - onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " + "onTheFlyUpdateMap[%lu] = %u.", __func__, update_addr, + update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Reduced the update_value with the entry in " + "onTheFlyUpdateMap. 
onTheFlyUpdateMap[%lu] = %u.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: 1: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); if (updateQueue.size() == updateQueueSize - 1) { respPort.checkRetryReq(); } } // TODO: Only schedule nextReadEvent only when it has to be scheduled - if ((!nextReadEvent.scheduled()) && - (!updateQueue.empty())) { + if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); } } @@ -166,9 +172,14 @@ void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + addrWorkListMap[addr] = wl; - // TODO: Add checks to see if scheduling is necessary or correct. - if ((!nextReduceEvent.scheduled()) && (!addrWorkListMap.empty())) { + DPRINTF(MPU, "%s: Received a WorkListItem from the coalesceEngine. Adding" + " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", + __func__, addr, wl.to_string()); + + assert(!addrWorkListMap.empty()); + if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } } @@ -182,25 +193,32 @@ WLEngine::processNextReduceEvent() std::vector servicedAddresses; while (it != addrWorkListMap.end()) { Addr addr = it->first; - WorkListItem wl = it->second; uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " - "%d, with new update: %d.\n", __func__, addr, wl.tempProp, - onTheFlyUpdateMap[addr]); + DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " + "addrWorkListMap values. 
onTheFlyUpdateMap[%lu] = %u, " + "addrWorkListMap[%lu] = %s.\n", __func__, + addr, onTheFlyUpdateMap[addr], + addr, addrWorkListMap[addr].to_string()); // TODO: Generalize this to reduce function rather than just min - wl.tempProp = std::min(update_value, wl.tempProp); + addrWorkListMap[addr].tempProp = + std::min(update_value, addrWorkListMap[addr].tempProp); + DPRINTF(MPU, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", + __func__, addr, addrWorkListMap[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, wl); + coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); servicedAddresses.push_back(addr); + DPRINTF(MPU, "%s: Added addr: %lu to servicedAdresses.\n", + __func__, addr); it++; } addrWorkListMap.clear(); for (int i = 0; i < servicedAddresses.size(); i++) { onTheFlyUpdateMap.erase(servicedAddresses[i]); + DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", + __func__, servicedAddresses[i]); } - DPRINTF(MPU, "%s: onTheFlyUpdateMap.size(): %u, servicedAddresses.size(): %u.\n", __func__, onTheFlyUpdateMap.size(), servicedAddresses.size()); } bool @@ -212,9 +230,10 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push_back(pkt); - + DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); assert(!updateQueue.empty()); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } From 8195339d419b32284f92f4c14395efc58a245604 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 7 Apr 2022 15:06:58 -0700 Subject: [PATCH 083/247] Restructing inheritance and fixiing inf queue. 
--- configs/accl/sega.py | 6 +- src/accl/graph/TODO.md | 1 + src/accl/graph/base/BaseReadEngine.py | 3 + src/accl/graph/base/base_read_engine.cc | 83 ++++++++ src/accl/graph/base/base_read_engine.hh | 18 +- src/accl/graph/base/data_structs.hh | 8 +- src/accl/graph/sega/CoalesceEngine.py | 3 +- src/accl/graph/sega/PushEngine.py | 1 - src/accl/graph/sega/coalesce_engine.cc | 254 ++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 16 +- src/accl/graph/sega/push_engine.cc | 142 ++++++------- src/accl/graph/sega/push_engine.hh | 55 +++-- src/accl/graph/sega/wl_engine.cc | 10 +- src/accl/graph/sega/wl_engine.hh | 2 +- 14 files changed, 348 insertions(+), 254 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a0c7766fe0..8e24280366 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -2,9 +2,9 @@ from m5.objects import * class MPU(SubSystem): - def __init__(self): + def __init__(self, base_edge_addr): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x80000000, + self.push_engine = PushEngine(base_edge_addr=base_edge_addr, push_req_queue_size=16, mem_resp_queue_size=8) self.coalesce_engine = CoalesceEngine( @@ -58,7 +58,7 @@ def __init__(self): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.mpu = MPU() + self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), vertex_binary="facebook/graph_binaries/vertices", diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index d5effbeb96..a0e2cefeff 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -6,3 +6,4 @@ and memory atom size in the coalesce engine * look at all the simobjects and come up with a general architecture. Make sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. 
+* get rid of maps with RequestPtr as keys diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py index 84c53465b9..3ddab2d3c4 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseReadEngine.py @@ -37,3 +37,6 @@ class BaseReadEngine(ClockedObject): system = Param.System(Parent.any, 'System this Engine is a part of') mem_port = RequestPort("Port to communicate with the memory") + + outstanding_mem_req_queue_size = Param.Int(16, "Capacity of queue in " + "which memory requests are queued.") diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index a32237db35..e3b588cfc6 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/base/base_read_engine.hh" +#include "debug/MPU.hh" namespace gem5 { @@ -35,6 +36,10 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), + outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + alarmRequested(false), + spaceRequested(0), + nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)) {} @@ -83,6 +88,31 @@ BaseReadEngine::MemPort::recvReqRetry() } } +void +BaseReadEngine::processNextMemReqEvent() +{ + if (memPort.blocked()) { + return; + } + + // TODO: Maybe add a DPRINTF here. 
+ PacketPtr pkt = outstandingMemReqQueue.front(); + memPort.sendPacket(pkt); + outstandingMemReqQueue.pop_front(); + + if (alarmRequested && + (outstandingMemReqQueue.size() <= + (outstandingMemReqQueueSize - spaceRequested))) { + alarmRequested = false; + spaceRequested = 0; + respondToAlarm(); + } + + if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + schedule(nextMemReqEvent, nextCycle()); + } +} + PacketPtr BaseReadEngine::createReadPacket(Addr addr, unsigned int size) { @@ -98,4 +128,57 @@ BaseReadEngine::createReadPacket(Addr addr, unsigned int size) return pkt; } +PacketPtr +BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +bool +BaseReadEngine::memReqQueueHasSpace(int space) +{ + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + return ( + outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space) + ); +} + +bool +BaseReadEngine::memReqQueueFull() +{ + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); +} + +void +BaseReadEngine::enqueueMemReq(PacketPtr pkt) +{ + panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); + outstandingMemReqQueue.push_back(pkt); + + assert(!outstandingMemReqQueue.empty()); + if (!nextMemReqEvent.scheduled()) { + schedule(nextMemReqEvent, nextCycle()); + } +} + +void +BaseReadEngine::requestAlarm(int space) { + panic_if((alarmRequested == true) || (spaceRequested != 0), + "You should not request another alarm without the first one being" + "responded to.\n"); + alarmRequested = true; + spaceRequested = space; +} + } diff --git 
a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index e21aaa01d2..bec922beef 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,16 +68,30 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; + int outstandingMemReqQueueSize; + bool alarmRequested; + int spaceRequested; + std::deque outstandingMemReqQueue; + + EventFunctionWrapper nextMemReqEvent; + void processNextMemReqEvent(); + protected: const RequestorID _requestorId; - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + bool memReqQueueHasSpace(int space); + bool memReqQueueFull(); + void enqueueMemReq(PacketPtr pkt); + bool pendingAlarm() { return alarmRequested; } + void requestAlarm(int space); + + virtual void respondToAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); public: PARAMS(BaseReadEngine); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index dacb74e38c..28a503528f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -70,10 +70,10 @@ struct __attribute__ ((packed)) Edge uint16_t weight : 16; uint64_t neighbor : 48; - // std::string to_string() - // { - // return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); - // } + std::string to_string() + { + return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + } Edge(uint16_t weight, uint64_t neighbor): weight(weight), diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 0330da7576..bec7e3d233 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -33,8 +33,7 @@ class 
CoalesceEngine(BaseReadEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - + peer_push_engine = Param.PushEngine(NULL, "") num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") - outstanding_mem_req_queue_size = Param.Int(20, "") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 129d9454c7..645bc5f4ea 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -36,5 +36,4 @@ class PushEngine(BaseReadEngine): req_port = RequestPort("Port to send updates to the outside") base_edge_addr = Param.Addr("") - mem_resp_queue_size = Param.Int(0, "") push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d7fa806fff..015629245b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -40,10 +40,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine(params.peer_push_engine), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - alarmRequested(false), - spaceRequested(0), - nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) @@ -85,14 +81,11 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit - // TODO: Make addrQueue and wlQueue into one std::pair - addrResponseQueue.push_back(addr); - worklistResponseQueue.push_back( - cacheBlocks[block_index].items[wl_offset]); + responseQueue.push_back(std::make_tuple(addr, + cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. 
Pushed cacheBlocks[%d][%d]: %s " - "to worklistResponseQueue. worklistResponseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, - worklistResponseQueue.size(), + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -100,7 +93,7 @@ CoalesceEngine::recvReadAddr(Addr addr) stats.readHits++; stats.numVertexReads++; - assert(!worklistResponseQueue.empty() && !addrResponseQueue.empty()); + assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } @@ -136,21 +129,18 @@ CoalesceEngine::recvReadAddr(Addr addr) "line[%d]", __func__, addr, block_index); return true; } else { - // TODO: Set valid to false every deallocation and assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - assert( - outstandingMemReqQueue.size() <= - outstandingMemReqQueueSize); + //TODO: Fix this to work with new inheritance. + // assert( + // outstandingMemReqQueue.size() <= + // outstandingMemReqQueueSize); DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (outstandingMemReqQueue.size() == - outstandingMemReqQueueSize) { - DPRINTF(MPU, "%s: No space in outstandingMemReqQueue " - "(outstandingMemReqQueue.size: %u). " - "Rejecting request.\n", __func__, - outstandingMemReqQueue.size()); + if (memReqQueueFull()) { + DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. " + "Rejecting request.\n", __func__); return false; } cacheBlocks[block_index].addr = aligned_addr; @@ -169,17 +159,10 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = 64.\n", __func__, addr, aligned_addr); - outstandingMemReqQueue.push_back(pkt); - DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %d", __func__, - outstandingMemReqQueue.size()); - + enqueueMemReq(pkt); + DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", + __func__); stats.numVertexBlockReads++; - - assert(!outstandingMemReqQueue.empty()); - if (!nextMemReqEvent.scheduled()) { - schedule(nextMemReqEvent, nextCycle()); - } return true; } } @@ -207,65 +190,41 @@ CoalesceEngine::recvReadAddr(Addr addr) } } -void -CoalesceEngine::processNextMemReqEvent() -{ - PacketPtr pkt = outstandingMemReqQueue.front(); - - if (!memPortBlocked()) { - sendMemReq(pkt); - DPRINTF(MPU, "%s: Sent a memory request to Addr: %lu, MemCmd: %s.\n", - __func__, pkt->getAddr(), pkt->isRead() ? "Read" : "Write"); - outstandingMemReqQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a packet from outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %u.\n", __func__, - outstandingMemReqQueue.size()); - } - - if ((alarmRequested) && - (outstandingMemReqQueue.size() < - (outstandingMemReqQueueSize - spaceRequested))) { - alarmRequested = false; - spaceRequested = 0; - schedule(nextApplyAndCommitEvent, nextCycle()); - DPRINTF(MPU, "%s: There is an alarm request for " - "nextApplyAndCommitEvent. 
Reset alarm parameters and scheduled " - "nextApplyAndCommitEvent.\n", __func__); - } - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - void CoalesceEngine::processNextRespondEvent() { - Addr addr_response = addrResponseQueue.front(); - WorkListItem worklist_response = worklistResponseQueue.front(); + Addr addr_response; + WorkListItem worklist_response; + std::tie(addr_response, worklist_response) = responseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); - addrResponseQueue.pop_front(); - worklistResponseQueue.pop_front(); + responseQueue.pop_front(); DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " "worklistResponseQueue.size = %d.\n", __func__, - worklistResponseQueue.size()); + responseQueue.size()); if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + (!responseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } +void +CoalesceEngine::respondToAlarm() +{ + assert(!nextApplyAndCommitEvent.scheduled()); + schedule(nextApplyAndCommitEvent, nextCycle()); +} + bool CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); if (pkt->isWrite()) { + delete pkt; DPRINTF(MPU, "%s: Received a write response for Addr: %lu. 
Dropping " "the packet.\n", __func__, pkt->getAddr()); return true; @@ -291,6 +250,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + delete pkt; int bias = 0; std::vector servicedIndices; @@ -303,13 +263,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); - addrResponseQueue.push_back(miss_addr); - worklistResponseQueue.push_back( - cacheBlocks[block_index].items[wl_offset]); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " - "worklistResponseQueue. worklistResponseQueue.size = %u.\n" + "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, - worklistResponseQueue.size()); + responseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; servicedIndices.push_back(i); @@ -336,8 +295,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + (!responseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -363,7 +321,8 @@ CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - Addr aligned_addr = (addr / 64) * 64; + // TODO: Parameterize all the numbers here. 
+ Addr aligned_addr = std::floor(addr / 64) * 64; int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / 16; @@ -371,6 +330,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); + + if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { + cacheBlocks[block_index].hasChange = true; + } + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; @@ -378,7 +342,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - // && (cacheBlocks[block_index].hasConflict) + // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add + // to evictQueue. if ((cacheBlocks[block_index].takenMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", @@ -389,8 +354,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())&& - ((!alarmRequested) && (spaceRequested == 0))) { + (!evictQueue.empty()) && + (pendingAlarm())) { schedule(nextApplyAndCommitEvent, nextCycle()); } @@ -399,36 +364,45 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyAndCommitEvent() { - assert((!alarmRequested) && (spaceRequested == 0)); + // FIXME: Refactor the line below to work with the new inheritance. 
+ // assert((!alarmRequested) && (spaceRequested == 0)); int block_index = evictQueue.front(); uint8_t changedMask = 0; - // TODO: parameterize 64 to memory atom size - uint8_t* wl_data; - uint8_t data[64]; DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); - if ((cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)) { + if ((cacheBlocks[block_index].hasChange)&& + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(2))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", __func__, block_index); - - } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + } else if ((cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict)) { + DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", + __func__, block_index); } else { - alarmRequested = true; - spaceRequested = cacheBlocks[block_index].hasConflict ? 2 : 1; + int spaceNeeded = cacheBlocks[block_index].hasConflict ? 2 : 1; + requestAlarm(spaceNeeded); DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. 
Set " - "an alarm for nextApplyAndCommitEvent when space = %d.\n", - __func__, spaceRequested); + "an alarm for nextApplyAndCommitEvent when there is %d space.\n", + __func__, spaceNeeded); return; } + // Reducing between tempProp and prop for each item in the cache line. for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( @@ -442,23 +416,18 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", __func__, block_index, i); } - wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); - std::memcpy(data + (i * sizeof(WorkListItem)), - wl_data, sizeof(WorkListItem)); } - if (changedMask) { + if (cacheBlocks[block_index].hasChange) { DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" , __func__, block_index); - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + // TODO: Parameterize this 64 to memory atom size PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, data); + cacheBlocks[block_index].addr, 64, + (uint8_t*) cacheBlocks[block_index].items); DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", __func__, write_pkt->getAddr()); if (cacheBlocks[block_index].hasConflict) { - assert( - outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1 - ); DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " "enough space in outstandingMemReqQueue for the write " "back packet and its subsequent read packet.\n", @@ -467,18 +436,19 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); // TODO: parameterize 64 - Addr aligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." " req addr (aligned_addr) = %lu, size = 64.\n", __func__, miss_addr, aligned_miss_addr); - outstandingMemReqQueue.push_back(write_pkt); - outstandingMemReqQueue.push_back(read_pkt); + + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + enqueueMemReq(read_pkt); DPRINTF(MPU, "%s: Added the evicting write back packet along with " "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %u.\n", __func__, - outstandingMemReqQueue.size()); + " to outstandingMemReqQueue.\n" , __func__); + // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -500,22 +470,25 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, 3); } + // TODO: This should be improved + + cacheBlocks[block_index].addr = aligned_miss_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - evictQueue.pop_front(); + cacheBlocks[block_index].hasChange = false; DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " " = %u.\n", __func__, evictQueue.size()); } else { - assert(outstandingMemReqQueue.size() < outstandingMemReqQueueSize); DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " "enough space in outstandingMemReqQueue for the write back" " packet.\n", __func__, block_index); - outstandingMemReqQueue.push_back(write_pkt); + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue. oustandingMemReqQueue.size = " - "%u.\n", __func__, outstandingMemReqQueue.size()); + "outstandingMemReqQueue.\n", __func__); + // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -537,33 +510,58 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, 3); } + + // Since allocated is false, does not matter what the address is. cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - evictQueue.pop_front(); + cacheBlocks[block_index].hasChange = false; DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " " = %u.\n", __func__, evictQueue.size()); } } else { - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary. Deallocated cache line[%d].\n", + "backs are necessary.\n", __func__, block_index, block_index); - evictQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - } + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " + "deallocating the line.\n", __func__, block_index); - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - stats.numVertexBlockWrites++; - schedule(nextMemReqEvent, nextCycle()); + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + } } + evictQueue.pop_front(); + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty())) { schedule(nextApplyAndCommitEvent, nextCycle()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 902a960301..6a8aadcbae 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -53,6 +53,7 @@ class CoalesceEngine : public BaseReadEngine bool allocated; bool valid; bool hasConflict; + bool hasChange; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block(): @@ -60,7 +61,8 @@ class CoalesceEngine : public BaseReadEngine takenMask(0), allocated(false), valid(false), - hasConflict(false) + hasConflict(false), + hasChange(false) {} }; @@ -73,13 +75,7 @@ class CoalesceEngine : public BaseReadEngine int numTgtsPerMSHR; std::unordered_map> MSHRMap; - int outstandingMemReqQueueSize; - bool alarmRequested; - int spaceRequested; - std::deque outstandingMemReqQueue; - - std::deque addrResponseQueue; - std::deque worklistResponseQueue; + std::deque> responseQueue; std::deque evictQueue; @@ -88,9 +84,6 @@ class CoalesceEngine : public BaseReadEngine PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); - EventFunctionWrapper nextMemReqEvent; - void processNextMemReqEvent(); - EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -115,6 +108,7 @@ class CoalesceEngine : public BaseReadEngine CoalesceStats stats; protected: + virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c9ed781d79..86418ac76e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -39,10 +39,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): reqPort(name() + ".req_port", this), 
baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), - memRespQueueSize(params.mem_resp_queue_size), - onTheFlyReadReqs(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), - nextReadEvent([this] { processNextReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()) {} @@ -66,12 +63,13 @@ PushEngine::startup() *tempPtr = 0; // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); - PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); + PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); - sendPushUpdate(first_update); + if (!reqPort.blocked()) { + reqPort.sendPacket(first_update); + } } - void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -108,19 +106,21 @@ PushEngine::ReqPort::recvReqRetry() bool PushEngine::recvWLItem(WorkListItem wl) { - assert(pushReqQueue.size() <= pushReqQueueSize); + assert((pushReqQueueSize == 0) || + (pushReqQueue.size() <= pushReqQueueSize)); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { return false; } Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t update_value = wl.prop; - pushReqQueue.push_back( - std::make_pair(std::make_pair(start_addr, end_addr), update_value)); + uint32_t value = wl.prop; - if ((!nextAddrGenEvent.scheduled()) && - (!pushReqQueue.empty())) { + // TODO: parameterize 64 to memory atom size + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); + + assert(!pushReqQueue.empty()); + if (!nextAddrGenEvent.scheduled()) { schedule(nextAddrGenEvent, nextCycle()); } return true; @@ -129,65 +129,44 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::processNextAddrGenEvent() { - Addr start_addr, end_addr; - uint32_t update_value; - std::pair, uint32_t> front = pushReqQueue.front(); - std::tie(start_addr, end_addr) = front.first; - update_value = front.second; + Addr 
aligned_addr, offset; + int num_edges; - Addr req_addr = (start_addr / 64) * 64; - Addr req_offset = start_addr % 64; - int num_edges = 0; + PushPacketInfoGen curr_info = pushReqQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - if (end_addr > req_addr + 64) { - num_edges = (req_addr + 64 - start_addr) / sizeof(Edge); - } else { - num_edges = (end_addr - start_addr) / sizeof(Edge); - } - PacketPtr pkt = createReadPacket(req_addr, 64); - reqOffsetMap[pkt->req] = req_offset; + PacketPtr pkt = createReadPacket(aligned_addr, 64); + reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = update_value; - pendingReadReqs.push_back(pkt); + reqValueMap[pkt->req] = curr_info.value(); - pushReqQueue.pop_front(); + enqueueMemReq(pkt); - if (req_addr + 64 < end_addr) { - pushReqQueue.push_front( - std::make_pair(std::make_pair(req_addr + 64, end_addr), update_value) - ); + if (curr_info.done()) { + pushReqQueue.pop_front(); } - if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { - schedule(nextAddrGenEvent, nextCycle()); + if ((memReqQueueFull()) && (!pushReqQueue.empty())) { + requestAlarm(1); + return; } - if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { - schedule(nextReadEvent, nextCycle()); + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); } } void -PushEngine::processNextReadEvent() +PushEngine::respondToAlarm() { - if (((memRespQueue.size() + onTheFlyReadReqs) <= memRespQueueSize) && - (!memPortBlocked())) { - PacketPtr pkt = pendingReadReqs.front(); - sendMemReq(pkt); - onTheFlyReadReqs++; - pendingReadReqs.pop_front(); - } - - if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { - schedule(nextReadEvent, nextCycle()); - } + assert(!nextAddrGenEvent.scheduled()); + schedule(nextAddrGenEvent, nextCycle()); } bool PushEngine::handleMemResp(PacketPtr pkt) { - onTheFlyReadReqs--; 
memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { @@ -201,39 +180,42 @@ void PushEngine::processNextPushEvent() { PacketPtr pkt = memRespQueue.front(); - RequestPtr req = pkt->req; - uint8_t *data = pkt->getPtr(); + uint8_t* data = pkt->getPtr(); - Addr offset = reqOffsetMap[req]; - uint32_t value = reqValueMap[req]; + Addr offset = reqOffsetMap[pkt->req]; + assert(offset < 64); + uint32_t value = reqValueMap[pkt->req]; DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " "offset: %lu\n", __func__, pkt->getAddr(), offset); - Edge* e = (Edge*) (data + offset); - // DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + Edge* curr_edge = (Edge*) (data + offset); // TODO: Implement propagate function here uint32_t update_value = value + 1; DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, update_value); + __func__, curr_edge->neighbor, update_value); - PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t), update_value); + PacketPtr update = createUpdatePacket( + curr_edge->neighbor, update_value); - if (sendPushUpdate(update)) { + if (!reqPort.blocked()) { DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", - __func__, e->neighbor, update_value); - reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); - reqNumEdgeMap[req]--; - } - - if (reqNumEdgeMap[req] == 0) { + __func__, curr_edge->neighbor, update_value); + reqPort.sendPacket(update); + reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); + assert(reqOffsetMap[pkt->req] <= 64); + reqNumEdgeMap[pkt->req]--; + assert(reqNumEdgeMap[pkt->req] >= 0); + } + + if (reqNumEdgeMap[pkt->req] == 0) { + reqOffsetMap.erase(pkt->req); + reqNumEdgeMap.erase(pkt->req); + reqValueMap.erase(pkt->req); + delete pkt; memRespQueue.pop_front(); - reqOffsetMap.erase(req); - reqNumEdgeMap.erase(req); - reqValueMap.erase(req); } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { @@ 
-241,11 +223,11 @@ PushEngine::processNextPushEvent() } } -PacketPtr -// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) { - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + RequestPtr req = std::make_shared( + addr, sizeof(T), 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher // bits req->setPC(((Addr) _requestorId) << 2); @@ -255,19 +237,9 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) pkt->allocate(); // pkt->setData(data); - pkt->setLE(value); + pkt->setLE(value); return pkt; } -bool -PushEngine::sendPushUpdate(PacketPtr pkt) -{ - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return false; -} - } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a539079ede..2aba0ca008 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -39,6 +39,42 @@ namespace gem5 class PushEngine : public BaseReadEngine { private: + class PushPacketInfoGen { + private: + Addr _start; + Addr _end; + size_t _step; + size_t _atom; + uint32_t _value; + + public: + PushPacketInfoGen(Addr start, Addr end, size_t step, + size_t atom, uint32_t value): + _start(start), _end(end), _step(step), + _atom(atom), _value(value) + {} + + std::tuple nextReadPacketInfo() + { + panic_if(done(), "Should not call nextPacketInfo when done.\n"); + Addr aligned_addr = std::floor(_start / _atom) * _atom; + Addr offset = _start - aligned_addr; + int num_items = 0; + + if (_end > (_start + _atom)) { + num_items = (_atom - offset) / _step; + } else { + num_items = (_end - _start) / _step; + } + _start = aligned_addr + _atom; + + return std::make_tuple(aligned_addr, offset, num_items); + } + + uint32_t value() { return _value; } + bool done() { 
return (_start >= _end); } + }; + class ReqPort : public RequestPort { private: @@ -64,37 +100,30 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::deque, uint32_t>> pushReqQueue; + std::deque pushReqQueue; // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - // TODO: Possibility of infinite queueing - std::deque pendingReadReqs; - - int memRespQueueSize; - int onTheFlyReadReqs; + // Since the push engine can process incoming packets faster than + // memory can send those packets, the size of this queue will + // always be limited by the b/w of the memory. std::deque memRespQueue; virtual void startup(); - // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); - - bool sendPushUpdate(PacketPtr pkt); + template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextAddrGenEvent; void processNextAddrGenEvent(); - EventFunctionWrapper nextReadEvent; - void processNextReadEvent(); - EventFunctionWrapper nextPushEvent; void processNextPushEvent(); protected: + virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ea45cae652..cca945ce0a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -118,9 +118,10 @@ WLEngine::getAddrRanges() const void WLEngine::processNextReadEvent() { - PacketPtr update = updateQueue.front(); - Addr update_addr = update->getAddr(); - uint32_t update_value = update->getLE(); + Addr update_addr; + uint32_t update_value; + std::tie(update_addr, update_value) = updateQueue.front(); + DPRINTF(MPU, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " "value: %u.\n", __func__, update_addr, update_value); @@ -229,10 +230,11 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.push_back(pkt); + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); + delete pkt; assert(!updateQueue.empty()); if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 476c9be932..12df93ee79 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -71,7 +71,7 @@ class WLEngine : public BaseReduceEngine CoalesceEngine* coalesceEngine; int updateQueueSize; - std::deque updateQueue; + std::deque> updateQueue; int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; From 02f7baf9938e2a9b30ea3d9b44140862160b5aba Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 8 Apr 2022 13:13:37 -0700 Subject: [PATCH 084/247] Fixing one scheduling error in events. 
--- configs/accl/sega.py | 7 +++---- src/accl/graph/base/base_read_engine.cc | 12 ++++++++++++ src/accl/graph/base/base_read_engine.hh | 2 ++ src/accl/graph/sega/coalesce_engine.cc | 8 ++++---- src/accl/graph/sega/push_engine.cc | 23 +++++++++++++++++------ src/accl/graph/sega/wl_engine.cc | 4 ++-- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8e24280366..e45580dd37 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,13 +5,12 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, - mem_resp_queue_size=8) + push_req_queue_size=16) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, - on_the_fly_update_map_size=8) + update_queue_size=16, + on_the_fly_update_map_size=8) self.interconnect = SystemXBar() self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index e3b588cfc6..1658d85627 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -86,6 +86,8 @@ BaseReadEngine::MemPort::recvReqRetry() if (!blocked()) { blockedPacket = nullptr; } + + owner->wakeUp(); } void @@ -177,8 +179,18 @@ BaseReadEngine::requestAlarm(int space) { panic_if((alarmRequested == true) || (spaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); + DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); alarmRequested = true; spaceRequested = space; } +void +BaseReadEngine::wakeUp() +{ + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } +} + } diff --git a/src/accl/graph/base/base_read_engine.hh 
b/src/accl/graph/base/base_read_engine.hh index bec922beef..5275f86449 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -108,6 +108,8 @@ class BaseReadEngine : public ClockedObject void recvFunctional(PacketPtr pkt); + void wakeUp(); + }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 015629245b..c740597a2c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -202,8 +202,8 @@ CoalesceEngine::processNextRespondEvent() __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " - "worklistResponseQueue.size = %d.\n", __func__, + DPRINTF(MPU, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, responseQueue.size()); if ((!nextRespondEvent.scheduled()) && @@ -338,7 +338,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; - DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, + DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
@@ -355,7 +355,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty()) && - (pendingAlarm())) { + (!pendingAlarm())) { schedule(nextApplyAndCommitEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 86418ac76e..3c1a98c69a 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -120,7 +120,8 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); assert(!pushReqQueue.empty()); - if (!nextAddrGenEvent.scheduled()) { + if ((!nextAddrGenEvent.scheduled()) && + (!memReqQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } return true; @@ -133,8 +134,11 @@ PushEngine::processNextAddrGenEvent() Addr aligned_addr, offset; int num_edges; - PushPacketInfoGen curr_info = pushReqQueue.front(); + PushPacketInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + DPRINTF(MPU, "%s: Current packet information generated by " + "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, 64); reqOffsetMap[pkt->req] = offset; @@ -144,11 +148,17 @@ PushEngine::processNextAddrGenEvent() enqueueMemReq(pkt); if (curr_info.done()) { + DPRINTF(MPU, "%s: Current PushPacketInfoGen is done.\n", __func__); pushReqQueue.pop_front(); + DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" + "pushReqQueue.size() = %u.\n", + __func__, pushReqQueue.size()); } - if ((memReqQueueFull()) && (!pushReqQueue.empty())) { - requestAlarm(1); + if (memReqQueueFull()) { + if (!pushReqQueue.empty()) { + requestAlarm(1); + } return; } @@ -162,6 +172,7 @@ PushEngine::respondToAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); + DPRINTF(MPU, "%s: Responded to an alarm.\n", __func__); } bool @@ -201,9 +212,9 @@ PushEngine::processNextPushEvent() curr_edge->neighbor, update_value); if (!reqPort.blocked()) { - DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); reqPort.sendPacket(update); + DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", + __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); assert(reqOffsetMap[pkt->req] <= 64); reqNumEdgeMap[pkt->req]--; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index cca945ce0a..ad9e93ba60 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -146,7 +146,7 @@ WLEngine::processNextReadEvent() } else { // TODO: Generalize this to reduce function rather than just min DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.", __func__, update_addr, + "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); @@ -231,7 +231,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" + DPRINTF(MPU, "%s: Pushed an item to the back of updateQueue" ". 
updateQueue.size = %u.\n", __func__, updateQueue.size()); delete pkt; From 4f58d86c6eae6696ffaf735d5999400db0310d46 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 10 Apr 2022 16:42:27 -0700 Subject: [PATCH 085/247] Works!!!!!! --- configs/accl/sega.py | 4 ++-- src/accl/graph/TODO.md | 6 ++++++ src/accl/graph/sega/push_engine.cc | 8 ++++++++ src/accl/graph/sega/push_engine.hh | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e45580dd37..e68097ce74 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -11,7 +11,7 @@ def __init__(self, base_edge_addr): self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) - self.interconnect = SystemXBar() + self.interconnect = SystemXBar(max_routing_table_size=16384) self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port @@ -40,7 +40,7 @@ def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): self.edge_mem_ctrl = SimpleMemory( range=edge_range, bandwidth="25GB/s", latency="30ns", image_file=edge_binary) - self.interconnect = SystemXBar() + self.interconnect = SystemXBar(max_routing_table_size=16384) self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port self.interconnect.mem_side_ports = self.edge_mem_ctrl.port diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index a0e2cefeff..f6d77d5e22 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -7,3 +7,9 @@ and memory atom size in the coalesce engine sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys + + +Advice from Jason: +* use tryEnqueueMemReq that returns a boolean that shows if it has succeeded to enqueue the request. 
+* if it +* scratch all of these \ No newline at end of file diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 3c1a98c69a..1fced87a43 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -106,6 +106,14 @@ PushEngine::ReqPort::recvReqRetry() bool PushEngine::recvWLItem(WorkListItem wl) { + // If there are no outdoing edges, no need to generate and push + // updates. Therefore, we only need to return true. + if (wl.degree == 0) { + DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", + __func__, wl.to_string()); + return true; + } + assert((pushReqQueueSize == 0) || (pushReqQueue.size() <= pushReqQueueSize)); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2aba0ca008..29d18709ee 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -61,7 +61,7 @@ class PushEngine : public BaseReadEngine Addr offset = _start - aligned_addr; int num_items = 0; - if (_end > (_start + _atom)) { + if (_end > (aligned_addr + _atom)) { num_items = (_atom - offset) / _step; } else { num_items = (_end - _start) / _step; From b920f152ec5a935d159e0d36904e7dba5079a502 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 12 Apr 2022 09:59:31 -0700 Subject: [PATCH 086/247] Removing SystemXBar from config script. 
[has-bug] --- configs/accl/sega.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e68097ce74..dd7623bfea 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -11,10 +11,6 @@ def __init__(self, base_edge_addr): self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) - self.interconnect = SystemXBar(max_routing_table_size=16384) - - self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port - self.interconnect.cpu_side_ports = self.push_engine.mem_port def getRespPort(self): return self.wl_engine.resp_port @@ -26,10 +22,15 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.req_port = port - def getMemPort(self): - return self.interconnect.mem_side_ports - def setMemPort(self, port): - self.interconnect.mem_side_ports = port + def getVertexMemPort(self): + return self.coalesce_engine.mem_port + def setVertexMemPort(self, port): + self.coalesce_engine.mem_port = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port class MPUMemory(SubSystem): def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): @@ -40,15 +41,16 @@ def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): self.edge_mem_ctrl = SimpleMemory( range=edge_range, bandwidth="25GB/s", latency="30ns", image_file=edge_binary) - self.interconnect = SystemXBar(max_routing_table_size=16384) - self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port - self.interconnect.mem_side_ports = self.edge_mem_ctrl.port + def getVertexPort(self): + return self.vertex_mem_ctrl.port + def setVertexPort(self, port): + self.vertex_mem_ctrl.port = port - def getPort(self): - return self.interconnect.cpu_side_ports - def setPort(self, port): - self.interconnect.cpu_side_ports = port + def getEdgePort(self): + 
return self.edge_mem_ctrl.port + def setEdgePort(self, port): + self.edge_mem_ctrl.port = port class SEGA(System): def __init__(self): @@ -65,7 +67,8 @@ def __init__(self): edge_binary="facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) - self.mpu.setMemPort(self.mem_ctrl.getPort()) + self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) + self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) system = SEGA() root = Root(full_system = False, system = system) From 58e3b63ea66d9709147566e3e72c882d9bd7216e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 12 Apr 2022 19:59:24 -0700 Subject: [PATCH 087/247] Fixing the bug when deallocating a taken line. --- configs/accl/sega.py | 4 +- src/accl/graph/base/BaseReadEngine.py | 3 + src/accl/graph/base/base_read_engine.cc | 1 + src/accl/graph/base/base_read_engine.hh | 2 + src/accl/graph/sega/CoalesceEngine.py | 2 + src/accl/graph/sega/coalesce_engine.cc | 388 +++++++++++++----------- 6 files changed, 222 insertions(+), 178 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index dd7623bfea..7f4663cc82 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -36,10 +36,10 @@ class MPUMemory(SubSystem): def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): super(MPUMemory, self).__init__() self.vertex_mem_ctrl = SimpleMemory( - range=vertex_range, bandwidth="25GB/s", + range=vertex_range, bandwidth="19.2GB/s", latency="30ns", image_file=vertex_binary) self.edge_mem_ctrl = SimpleMemory( - range=edge_range, bandwidth="25GB/s", + range=edge_range, bandwidth="19.2GB/s", latency="30ns", image_file=edge_binary) def getVertexPort(self): diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py index 3ddab2d3c4..d4ab622fd6 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseReadEngine.py @@ -40,3 +40,6 @@ class BaseReadEngine(ClockedObject): outstanding_mem_req_queue_size = Param.Int(16, 
"Capacity of queue in " "which memory requests are queued.") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 1658d85627..19214a3bd1 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -36,6 +36,7 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), alarmRequested(false), spaceRequested(0), diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 5275f86449..0cab95dbbb 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,6 +68,8 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; + int peerMemoryAtomSize; + int outstandingMemReqQueueSize; bool alarmRequested; int spaceRequested; diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index bec7e3d233..3e5699f552 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,3 +37,5 @@ class CoalesceEngine(BaseReadEngine): peer_push_engine = Param.PushEngine(NULL, "") num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") + + cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index c740597a2c..41d1fe4953 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -88,7 +88,11 @@ CoalesceEngine::recvReadAddr(Addr addr) __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a 
bitset instead of unsigned int for takenMask + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.readHits++; stats.numVertexReads++; @@ -144,7 +148,11 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -256,7 +264,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / 16; @@ -269,7 +277,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.numVertexReads++; servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " @@ -336,7 +348,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].items[wl_offset] = wl; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -373,189 +389,209 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); - - if ((cacheBlocks[block_index].hasChange)&& - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(2))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - 
(!cacheBlocks[block_index].hasConflict)) { - DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", - __func__, block_index); - } else { - int spaceNeeded = cacheBlocks[block_index].hasConflict ? 2 : 1; - requestAlarm(spaceNeeded); - DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " - "an alarm for nextApplyAndCommitEvent when there is %d space.\n", - __func__, spaceNeeded); - return; - } - - // Reducing between tempProp and prop for each item in the cache line. - for (int i = 0; i < 4; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, - block_index, i, - cacheBlocks[block_index].items[i].to_string()); - if (old_prop != cacheBlocks[block_index].items[i].prop) { - changedMask |= (1 << i); - DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", - __func__, block_index, i); + if (cacheBlocks[block_index].takenMask == 0) { + if ((cacheBlocks[block_index].hasChange)&& + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(2))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict)) { + DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", + __func__, block_index); + } else { + int spaceNeeded = 
cacheBlocks[block_index].hasConflict ? 2 : 1; + requestAlarm(spaceNeeded); + DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " + "an alarm for nextApplyAndCommitEvent when there is %d space.\n", + __func__, spaceNeeded); + return; } - } - if (cacheBlocks[block_index].hasChange) { - DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" - , __func__, block_index); - // TODO: Parameterize this 64 to memory atom size - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", - __func__, write_pkt->getAddr()); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
- " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); - - enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); + // Reducing between tempProp and prop for each item in the cache line. 
+ for (int i = 0; i < 4; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, + block_index, i, + cacheBlocks[block_index].items[i].to_string()); + if (old_prop != cacheBlocks[block_index].items[i].prop) { + changedMask |= (1 << i); + DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", + __func__, block_index, i); } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + } + + if (cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" + , __func__, block_index); + // TODO: Parameterize this 64 to memory atom size + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, 64, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", + __func__, write_pkt->getAddr()); + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + enqueueMemReq(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue.\n" , __func__); + + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); + } + // TODO: This should be improved + + cacheBlocks[block_index].addr = aligned_miss_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue.\n", __func__); + + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); + } + + // Since allocated is false, does not matter what the address is. + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } - // TODO: This should be improved - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " + "backs are necessary.\n", __func__, block_index); + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " + "deallocating the line.\n", __func__, block_index); + + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; } - - // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } } else { - DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary.\n", - __func__, block_index, block_index); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " - "deallocating the line.\n", __func__, block_index); - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " + "for eviction. 
Therefore, ignoring the evict schedule.\n", + __func__, block_index); } evictQueue.pop_front(); From 6e7cb504f2c0e2db7e4d1b417994ab53e200ff7c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 09:46:44 -0700 Subject: [PATCH 088/247] Parameterizing cache_size and memory_atom_size. --- src/accl/graph/TODO.md | 12 --- src/accl/graph/base/base_read_engine.cc | 7 +- src/accl/graph/base/base_read_engine.hh | 4 +- src/accl/graph/sega/CoalesceEngine.py | 5 +- src/accl/graph/sega/coalesce_engine.cc | 127 +++++++++--------------- src/accl/graph/sega/coalesce_engine.hh | 16 +-- src/accl/graph/sega/push_engine.cc | 9 +- 7 files changed, 74 insertions(+), 106 deletions(-) diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index f6d77d5e22..1cec4dc6f9 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,15 +1,3 @@ # TODO Items - -* use setLE/setBE inside createUpdatePacket and createWritePacket -* parameterize cache size, associativity, maybe latencies, -and memory atom size in the coalesce engine -* look at all the simobjects and come up with a general architecture. Make -sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys - - -Advice from Jason: -* use tryEnqueueMemReq that returns a boolean that shows if it has succeeded to enqueue the request. 
-* if it -* scratch all of these \ No newline at end of file diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 19214a3bd1..714a4542f1 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -36,12 +36,12 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - peerMemoryAtomSize(params.attached_memory_atom_size), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), alarmRequested(false), spaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), - _requestorId(system->getRequestorId(this)) + _requestorId(system->getRequestorId(this)), + peerMemoryAtomSize(params.attached_memory_atom_size) {} BaseReadEngine::~BaseReadEngine() @@ -101,6 +101,9 @@ BaseReadEngine::processNextMemReqEvent() // TODO: Maybe add a DPRINTF here. PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); + DPRINTF(MPU, "%s: Sent a packet to memory with the following info. 
" + "pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); if (alarmRequested && diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 0cab95dbbb..f11459ad6e 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,8 +68,6 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; - int peerMemoryAtomSize; - int outstandingMemReqQueueSize; bool alarmRequested; int spaceRequested; @@ -81,6 +79,8 @@ class BaseReadEngine : public ClockedObject protected: const RequestorID _requestorId; + size_t peerMemoryAtomSize; + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } bool memReqQueueHasSpace(int space); diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 3e5699f552..faa5295ed7 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -35,7 +35,10 @@ class CoalesceEngine(BaseReadEngine): cxx_class = 'gem5::CoalesceEngine' peer_push_engine = Param.PushEngine(NULL, "") + + cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") + num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") - cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 41d1fe4953..4d152e375d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -38,21 +38,17 @@ namespace gem5 CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): BaseReadEngine(params), peerPushEngine(params.peer_push_engine), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), 
nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) -{} - -void -CoalesceEngine::startup() { - for (int i = 0; i < 256; i++) { - cacheBlocks[i].takenMask = 0; - cacheBlocks[i].allocated = false; - cacheBlocks[i].valid = false; - cacheBlocks[i].hasConflict = false; + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); } } @@ -74,8 +70,8 @@ CoalesceEngine::recvReadAddr(Addr addr) assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); - Addr aligned_addr = (addr / 64) * 64; - int block_index = (aligned_addr / 64) % 256; + Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); if ((cacheBlocks[block_index].addr == aligned_addr) && @@ -162,11 +158,11 @@ CoalesceEngine::recvReadAddr(Addr addr) MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); - // TODO: Parameterize 64 to memory atom size - PacketPtr pkt = createReadPacket(aligned_addr, 64); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, addr, aligned_addr); + " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, addr, aligned_addr, peerMemoryAtomSize); enqueueMemReq(pkt); DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); @@ -240,10 +236,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - // TODO: After parameterizing the cache size - // this 256 number should change to the cache - // size parameter. 
- int block_index = (addr / 64) % 256; + + int block_index = (addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); @@ -264,10 +258,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / 16; + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); @@ -334,9 +328,9 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. - Addr aligned_addr = std::floor(addr / 64) * 64; - int block_index = (aligned_addr / 64) % 256; - int wl_offset = (addr - aligned_addr) / 16; + Addr aligned_addr = std::floor(addr / peerMemoryAtomSize) * peerMemoryAtomSize; + int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); @@ -437,12 +431,12 @@ CoalesceEngine::processNextApplyAndCommitEvent() if (cacheBlocks[block_index].hasChange) { DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" , __func__, block_index); - // TODO: Parameterize this 64 to memory atom size + PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, + cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", - __func__, write_pkt->getAddr()); + DPRINTF(MPU, 
"%s: Created a write packet to Addr: %lu, size = %d.\n", + __func__, write_pkt->getAddr(), peerMemoryAtomSize); if (cacheBlocks[block_index].hasConflict) { DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " "enough space in outstandingMemReqQueue for the write " @@ -451,12 +445,15 @@ CoalesceEngine::processNextApplyAndCommitEvent() Addr miss_addr = MSHRMap[block_index][0]; DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * + peerMemoryAtomSize; + PacketPtr read_pkt = createReadPacket( + aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); + " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); enqueueMemReq(write_pkt); stats.numVertexBlockWrites++; @@ -465,28 +462,13 @@ CoalesceEngine::processNextApplyAndCommitEvent() "its subsequent read packet (to service the conflicts)" " to outstandingMemReqQueue.\n" , __func__); - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - 
peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); + } } - // TODO: This should be improved cacheBlocks[block_index].addr = aligned_miss_addr; DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, @@ -509,26 +491,12 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Added the write back packet to " "outstandingMemReqQueue.\n", __func__); - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); + } } // Since allocated is false, does not matter what the address is. 
@@ -555,11 +523,14 @@ CoalesceEngine::processNextApplyAndCommitEvent() Addr miss_addr = MSHRMap[block_index][0]; DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * + peerMemoryAtomSize; + PacketPtr read_pkt = createReadPacket( + aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", + " req addr (aligned_addr) = %lu, size = %d.\n", __func__, miss_addr, aligned_miss_addr); enqueueMemReq(read_pkt); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6a8aadcbae..0ddbdfdeb1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,7 +47,7 @@ class CoalesceEngine : public BaseReadEngine private: struct Block { - WorkListItem items[4]; + WorkListItem* items; Addr addr; uint8_t takenMask; bool allocated; @@ -56,20 +56,26 @@ class CoalesceEngine : public BaseReadEngine bool hasChange; // TODO: This might be useful in the future // Tick lastWLWriteTick; - Block(): + Block() {} + Block(int num_elements): addr(0), takenMask(0), allocated(false), valid(false), hasConflict(false), hasChange(false) - {} + { + items = new WorkListItem [num_elements]; + } }; WLEngine* peerWLEngine; PushEngine* peerPushEngine; - Block cacheBlocks[256]; + Block* cacheBlocks; + + int numLines; + int numElementsPerLine; int numMSHREntry; int numTgtsPerMSHR; @@ -79,8 +85,6 @@ class CoalesceEngine : public BaseReadEngine std::deque evictQueue; - virtual void startup(); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); diff --git 
a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 1fced87a43..8dcbac0dcc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -124,8 +124,7 @@ PushEngine::recvWLItem(WorkListItem wl) Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - // TODO: parameterize 64 to memory atom size - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && @@ -148,7 +147,7 @@ PushEngine::processNextAddrGenEvent() "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); - PacketPtr pkt = createReadPacket(aligned_addr, 64); + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; reqValueMap[pkt->req] = curr_info.value(); @@ -202,7 +201,7 @@ PushEngine::processNextPushEvent() uint8_t* data = pkt->getPtr(); Addr offset = reqOffsetMap[pkt->req]; - assert(offset < 64); + assert(offset < peerMemoryAtomSize); uint32_t value = reqValueMap[pkt->req]; DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " @@ -224,7 +223,7 @@ PushEngine::processNextPushEvent() DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); - assert(reqOffsetMap[pkt->req] <= 64); + assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); reqNumEdgeMap[pkt->req]--; assert(reqNumEdgeMap[pkt->req] >= 0); } From c216819f0ff4c7103a6f62e416f897068a460e52 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 10:21:28 -0700 Subject: [PATCH 089/247] Renaming BaseReadEngine to BaseMemEngine. 
--- configs/accl/sega.py | 6 ++- .../{BaseReadEngine.py => BaseMemEngine.py} | 8 ++-- src/accl/graph/base/SConscript | 4 +- ...base_read_engine.cc => base_mem_engine.cc} | 30 +++++++------- ...base_read_engine.hh => base_mem_engine.hh} | 20 +++++----- src/accl/graph/base/data_structs.hh | 6 +-- src/accl/graph/sega/CoalesceEngine.py | 4 +- src/accl/graph/sega/PushEngine.py | 4 +- src/accl/graph/sega/coalesce_engine.cc | 39 ++----------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 4 +- src/accl/graph/sega/push_engine.hh | 4 +- 12 files changed, 52 insertions(+), 81 deletions(-) rename src/accl/graph/base/{BaseReadEngine.py => BaseMemEngine.py} (92%) rename src/accl/graph/base/{base_read_engine.cc => base_mem_engine.cc} (87%) rename src/accl/graph/base/{base_read_engine.hh => base_mem_engine.hh} (88%) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7f4663cc82..7d8b96490d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,9 +5,11 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16) + push_req_queue_size=16, + attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( - peer_push_engine=self.push_engine) + peer_push_engine=self.push_engine, + attached_memory_atom_size=64) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseMemEngine.py similarity index 92% rename from src/accl/graph/base/BaseReadEngine.py rename to src/accl/graph/base/BaseMemEngine.py index d4ab622fd6..69f68e9dfc 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseMemEngine.py @@ -29,11 +29,11 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class BaseReadEngine(ClockedObject): +class 
BaseMemEngine(ClockedObject): abstract = True - type = 'BaseReadEngine' - cxx_header = "accl/graph/base/base_read_engine.hh" - cxx_class = 'gem5::BaseReadEngine' + type = 'BaseMemEngine' + cxx_header = "accl/graph/base/base_mem_engine.hh" + cxx_class = 'gem5::BaseMemEngine' system = Param.System(Parent.any, 'System this Engine is a part of') mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index ea96f4323b..4c90dfa9a6 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,8 +27,8 @@ Import('*') -SimObject('BaseReadEngine.py') +SimObject('BaseMemEngine.py') SimObject('BaseReduceEngine.py') -Source('base_read_engine.cc') +Source('base_mem_engine.cc') Source('base_reduce_engine.cc') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_mem_engine.cc similarity index 87% rename from src/accl/graph/base/base_read_engine.cc rename to src/accl/graph/base/base_mem_engine.cc index 714a4542f1..50e64ae7c3 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -26,13 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" namespace gem5 { -BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): +BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), @@ -44,11 +44,11 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): peerMemoryAtomSize(params.attached_memory_atom_size) {} -BaseReadEngine::~BaseReadEngine() +BaseMemEngine::~BaseMemEngine() {} Port& -BaseReadEngine::getPort(const std::string &if_name, PortID idx) +BaseMemEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "mem_port") { return memPort; @@ -58,7 +58,7 @@ BaseReadEngine::getPort(const std::string &if_name, PortID idx) } void -BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) +BaseMemEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
@@ -70,14 +70,14 @@ BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) } bool -BaseReadEngine::MemPort::recvTimingResp(PacketPtr pkt) +BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) { //TODO: Investigate sending true all the time return owner->handleMemResp(pkt); } void -BaseReadEngine::MemPort::recvReqRetry() +BaseMemEngine::MemPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); @@ -92,7 +92,7 @@ BaseReadEngine::MemPort::recvReqRetry() } void -BaseReadEngine::processNextMemReqEvent() +BaseMemEngine::processNextMemReqEvent() { if (memPort.blocked()) { return; @@ -120,7 +120,7 @@ BaseReadEngine::processNextMemReqEvent() } PacketPtr -BaseReadEngine::createReadPacket(Addr addr, unsigned int size) +BaseMemEngine::createReadPacket(Addr addr, unsigned int size) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -135,7 +135,7 @@ BaseReadEngine::createReadPacket(Addr addr, unsigned int size) } PacketPtr -BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); @@ -151,7 +151,7 @@ BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseReadEngine::memReqQueueHasSpace(int space) +BaseMemEngine::memReqQueueHasSpace(int space) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return ( @@ -160,14 +160,14 @@ BaseReadEngine::memReqQueueHasSpace(int space) } bool -BaseReadEngine::memReqQueueFull() +BaseMemEngine::memReqQueueFull() { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); } void -BaseReadEngine::enqueueMemReq(PacketPtr pkt) +BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not 
enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); @@ -179,7 +179,7 @@ BaseReadEngine::enqueueMemReq(PacketPtr pkt) } void -BaseReadEngine::requestAlarm(int space) { +BaseMemEngine::requestAlarm(int space) { panic_if((alarmRequested == true) || (spaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); @@ -189,7 +189,7 @@ BaseReadEngine::requestAlarm(int space) { } void -BaseReadEngine::wakeUp() +BaseMemEngine::wakeUp() { if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_mem_engine.hh similarity index 88% rename from src/accl/graph/base/base_read_engine.hh rename to src/accl/graph/base/base_mem_engine.hh index f11459ad6e..fb7cab91b0 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -26,33 +26,33 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ #include #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/BaseReadEngine.hh" +#include "params/BaseMemEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" namespace gem5 { -class BaseReadEngine : public ClockedObject +class BaseMemEngine : public ClockedObject { private: class MemPort : public RequestPort { private: - BaseReadEngine* owner; + BaseMemEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - MemPort(const std::string& name, BaseReadEngine* owner): + MemPort(const std::string& name, BaseMemEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -96,10 +96,10 @@ class BaseReadEngine : public ClockedObject PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); public: - PARAMS(BaseReadEngine); + PARAMS(BaseMemEngine); - BaseReadEngine(const BaseReadEngineParams ¶ms); - ~BaseReadEngine(); + BaseMemEngine(const BaseMemEngineParams ¶ms); + ~BaseMemEngine(); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -116,4 +116,4 @@ class BaseReadEngine : public ClockedObject } -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 28a503528f..409245eeaa 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_GRAPH_BASE_UTIL_HH__ -#define __ACCL_GRAPH_BASE_UTIL_HH__ +#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ +#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ #include "base/cprintf.hh" @@ -83,4 +83,4 @@ struct __attribute__ ((packed)) Edge } -#endif // __ACCL_GRAPH_BASE_UTIL_HH__ +#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index faa5295ed7..086f284950 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseReadEngine import BaseReadEngine +from m5.objects.BaseMemEngine import BaseMemEngine -class CoalesceEngine(BaseReadEngine): +class CoalesceEngine(BaseMemEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 645bc5f4ea..d3276799aa 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseReadEngine import BaseReadEngine +from m5.objects.BaseMemEngine import BaseMemEngine -class PushEngine(BaseReadEngine): +class PushEngine(BaseMemEngine): type = 'PushEngine' cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4d152e375d..1c5dee8b8f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -36,7 +36,7 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): - BaseReadEngine(params), + BaseMemEngine(params), peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), @@ -83,12 +83,8 
@@ CoalesceEngine::recvReadAddr(Addr addr) "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); - // TODO: Use a bitset instead of unsigned int for takenMask - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.readHits++; stats.numVertexReads++; @@ -144,11 +140,7 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -271,11 +263,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.numVertexReads++; servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " @@ -342,11 +330,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].items[wl_offset] = wl; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -413,7 +397,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() } // Reducing between tempProp and prop for each item in the cache line. 
- for (int i = 0; i < 4; i++) { + for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, @@ -471,11 +455,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() } cacheBlocks[block_index].addr = aligned_miss_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; @@ -500,11 +480,8 @@ CoalesceEngine::processNextApplyAndCommitEvent() } // Since allocated is false, does not matter what the address is. - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -535,11 +512,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() enqueueMemReq(read_pkt); cacheBlocks[block_index].addr = aligned_miss_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; @@ -548,11 +521,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
Just " "deallocating the line.\n", __func__, block_index); - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0ddbdfdeb1..4c4cb4567b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,7 +29,7 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" @@ -42,7 +42,7 @@ namespace gem5 class WLEngine; -class CoalesceEngine : public BaseReadEngine +class CoalesceEngine : public BaseMemEngine { private: struct Block diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8dcbac0dcc..53cb428b12 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -35,7 +35,7 @@ namespace gem5 { PushEngine::PushEngine(const PushEngineParams ¶ms): - BaseReadEngine(params), + BaseMemEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), @@ -49,7 +49,7 @@ PushEngine::getPort(const std::string &if_name, PortID idx) if (if_name == "req_port") { return reqPort; } else if (if_name == "mem_port") { - return BaseReadEngine::getPort(if_name, idx); + return BaseMemEngine::getPort(if_name, idx); } else { return SimObject::getPort(if_name, idx); } diff --git a/src/accl/graph/sega/push_engine.hh 
b/src/accl/graph/sega/push_engine.hh index 29d18709ee..5e8b079d88 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,14 +29,14 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "params/PushEngine.hh" namespace gem5 { -class PushEngine : public BaseReadEngine +class PushEngine : public BaseMemEngine { private: class PushPacketInfoGen { From 293cb52c7cd6175ee9f5e8e279a363b781ca0b15 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 10:30:08 -0700 Subject: [PATCH 090/247] Adding a new SConscript for src/accl. --- configs/accl/sega.py | 4 ++-- src/accl/graph/SConscript | 30 ++++++++++++++++++++++++++++++ src/accl/graph/sega/SConscript | 2 +- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 src/accl/graph/SConscript diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7d8b96490d..4168217f4d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="facebook/graph_binaries/vertices", + vertex_binary="graphs/facebook/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="facebook/graph_binaries/edgelist_0") + edge_binary="graphs/facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript new file mode 100644 index 0000000000..00fa2466dd --- /dev/null +++ b/src/accl/graph/SConscript @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Import('*') + +DebugFlag('MPU') \ No newline at end of file diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 9b4629838b..6e563b2677 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -35,4 +35,4 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('MPU') +DebugFlag('WLWrites') From 5df2ae29e0faaa80cda5721ad137cdc84b6235e8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 14:11:32 -0700 Subject: [PATCH 091/247] Fixing stats and adding a few new ones. --- configs/accl/sega.py | 4 +-- src/accl/graph/sega/SConscript | 2 +- src/accl/graph/sega/coalesce_engine.cc | 43 ++++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 4 +-- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 4168217f4d..0532aa2153 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/facebook/graph_binaries/vertices", + vertex_binary="graphs/epinions/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/facebook/graph_binaries/edgelist_0") + edge_binary="graphs/epinions/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 6e563b2677..19d702c49a 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -35,4 +35,4 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('WLWrites') +DebugFlag('ApplyUpdates') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1c5dee8b8f..36a7ddb6d2 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/wl_engine.hh" +#include "debug/ApplyUpdates.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -83,16 +84,14 @@ CoalesceEngine::recvReadAddr(Addr addr) "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); - cacheBlocks[block_index].takenMask |= (1 << wl_offset); - stats.readHits++; - stats.numVertexReads++; assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } + stats.numVertexReads++; return true; } else { // miss @@ -105,6 +104,7 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries DPRINTF(MPU, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); + stats.readRejections++; return false; } else { DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); @@ -117,12 +117,15 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); + stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d]", __func__, addr, block_index); + stats.readMisses++; + stats.numVertexReads++; return true; } else { assert(!cacheBlocks[block_index].valid); @@ -137,6 +140,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if (memReqQueueFull()) { DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. 
" "Rejecting request.\n", __func__); + stats.readRejections++; return false; } cacheBlocks[block_index].addr = aligned_addr; @@ -158,7 +162,8 @@ CoalesceEngine::recvReadAddr(Addr addr) enqueueMemReq(pkt); DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); - stats.numVertexBlockReads++; + stats.readMisses++; + stats.numVertexReads++; return true; } } @@ -169,6 +174,7 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); + stats.readRejections++; return false; } if ((!cacheBlocks[block_index].hasConflict) && @@ -178,9 +184,17 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; } + + if (aligned_addr != cacheBlocks[block_index].addr) { + stats.readMisses++; + } else { + stats.readHits++; + } + MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); + stats.numVertexReads++; return true; } } @@ -264,7 +278,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) , __func__, block_index, wl_offset, responseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - stats.numVertexReads++; + servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); @@ -334,7 +348,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); - // TODO: Make this more general and programmable. // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add // to evictQueue. 
@@ -440,7 +453,6 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; enqueueMemReq(read_pkt); DPRINTF(MPU, "%s: Added the evicting write back packet along with " "its subsequent read packet (to service the conflicts)" @@ -448,6 +460,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() for (int i = 0; i < numElementsPerLine; i++) { if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, i); @@ -467,12 +482,14 @@ CoalesceEngine::processNextApplyAndCommitEvent() "enough space in outstandingMemReqQueue for the write back" " packet.\n", __func__, block_index); enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; DPRINTF(MPU, "%s: Added the write back packet to " "outstandingMemReqQueue.\n", __func__); for (int i = 0; i < numElementsPerLine; i++) { if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, i); @@ -548,16 +565,16 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), - ADD_STAT(numVertexBlockReads, statistics::units::Count::get(), - "Number of memory blocks read for vertecies"), - ADD_STAT(numVertexBlockWrites, statistics::units::Count::get(), - "Number of memory blocks writes for vertecies"), ADD_STAT(numVertexReads, 
statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), "Number of memory vertecies written to cache."), ADD_STAT(readHits, statistics::units::Count::get(), - "Number of cache hits.") + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readRejections, statistics::units::Count::get(), + "Number of cache rejections.") { } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4c4cb4567b..efd19d3e9b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -102,11 +102,11 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine &coalesce; - statistics::Scalar numVertexBlockReads; - statistics::Scalar numVertexBlockWrites; statistics::Scalar numVertexReads; statistics::Scalar numVertexWrites; statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readRejections; }; CoalesceStats stats; From 4e169aa65eb3e7e1302c66c4031695515d613fff Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 15 Apr 2022 15:21:34 -0700 Subject: [PATCH 092/247] Fixing memory atom size issue. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0532aa2153..61df2cc2ef 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ def __init__(self, base_edge_addr): attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, - attached_memory_atom_size=64) + attached_memory_atom_size=32) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 36a7ddb6d2..e54447fd09 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -251,7 +251,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR - for (int i = 0; i < 4; i++) { + for (int i = 0; i < numElementsPerLine; i++) { cacheBlocks[block_index].items[i] = *((WorkListItem*) ( data + (i * sizeof(WorkListItem)))); DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 53cb428b12..195cb65dbc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -185,6 +185,8 @@ PushEngine::respondToAlarm() bool PushEngine::handleMemResp(PacketPtr pkt) { + // TODO: in case we need to edit edges, get rid of second statement. + assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { From 7f52d64d0433af8ec9727ef6e6d18c297e039f8e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 17 Apr 2022 13:34:12 -0700 Subject: [PATCH 093/247] Removing dead code. 
--- configs/accl/sega.py | 4 ++-- src/accl/graph/sega/push_engine.cc | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 61df2cc2ef..450f158f93 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/epinions/graph_binaries/vertices", + vertex_binary="graphs/test-graph/graph_binaries/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/epinions/graph_binaries/edgelist_0") + edge_binary="graphs/test-graph/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 195cb65dbc..716daf92e8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -58,11 +58,6 @@ PushEngine::getPort(const std::string &if_name, PortID idx) void PushEngine::startup() { - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); if (!reqPort.blocked()) { From 2ca8a986a07d819484f5bc40d18101481d6cdf40 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 19 Apr 2022 12:03:25 -0700 Subject: [PATCH 094/247] [WIP] added the central control unit. 
It has error about the crossbar --- configs/accl/sega.py | 10 +- src/accl/graph/sega/CenteralController.py | 39 +++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/centeral_controller.cc | 123 +++++++++++++++++++++ src/accl/graph/sega/centeral_controller.hh | 84 ++++++++++++++ src/accl/graph/sega/push_engine.cc | 10 -- src/accl/graph/sega/push_engine.hh | 2 - src/accl/graph/sega/wl_engine.cc | 6 + src/accl/graph/sega/wl_engine.hh | 2 + 9 files changed, 263 insertions(+), 15 deletions(-) create mode 100644 src/accl/graph/sega/CenteralController.py create mode 100644 src/accl/graph/sega/centeral_controller.cc create mode 100644 src/accl/graph/sega/centeral_controller.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 450f158f93..c4288c92d3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,14 +61,18 @@ def __init__(self): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() + self.ctrl = CenteralController(addr=0, value=0) self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/test-graph/graph_binaries/vertices_0", + vertex_binary="graphs/test/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/test-graph/graph_binaries/edgelist_0") + edge_binary="graphs/test/edgelist_0") + self.interconnect = SystemXBar() - self.mpu.setReqPort(self.mpu.getRespPort()) + self.ctrl.req_port = self.interconnect.cpu_side_ports + self.mpu.setReqPort(self.interconnect.cpu_side_ports) + self.mpu.setRespPort(self.interconnect.mem_side_ports) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..7b00f8b12d --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 
-*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class CenteralController(ClockedObject): + type = 'CenteralController' + cxx_header = "accl/graph/sega/centeral_controller.hh" + cxx_class = 'gem5::CenteralController' + + req_port = RequestPort("Port to send updates to the outside") + addr = Param.Addr("") + value = Param.Int(0, "") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 19d702c49a..c8810bbdb2 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,12 @@ Import('*') +SimObject('CenteralController.py') SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') +Source('centeral_controller.cc') Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc new file mode 100644 index 0000000000..daa2d9b390 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.cc @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/centeral_controller.hh" + +#include "mem/packet_access.hh" + +namespace gem5 +{ + +CenteralController::CenteralController + (const CenteralControllerParams ¶ms): + ClockedObject(params), + reqPort(name() + ".req_port", this), + addr(params.addr), + value(params.value) +{} + +Port& +CenteralController::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +CenteralController::startup() +{ + PacketPtr first_update = + createUpdatePacket(addr, value); + + if (!reqPort.blocked()) { + reqPort.sendPacket(first_update); + } +} + +template PacketPtr +CenteralController::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared( + addr, sizeof(T), addr, value); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) value) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +// AddrRangeList +// CenteralController::ReqPort::getAddrRanges() const 
+// { +// AddrRangeList ret; +// ret.clear(); +// return ret; +// } + +void +CenteralController::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +CenteralController::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!_blocked) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh new file mode 100644 index 0000000000..0e1bb6ac80 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.hh @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ + +#include "accl/graph/base/data_structs.hh" +#include "params/CenteralController.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class CenteralController : public ClockedObject +{ + private: + class ReqPort : public RequestPort + { + private: + CenteralController* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, CenteralController* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + // virtual AddrRangeList getAddrRanges() const; + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; + + Addr addr; + uint32_t value; + + template PacketPtr + createUpdatePacket(Addr addr, T value); + + virtual void startup(); + + public: + PARAMS(CenteralController); + CenteralController(const CenteralControllerParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} + 
+#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 716daf92e8..ddfc2edef8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -55,16 +55,6 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } -void -PushEngine::startup() -{ - PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); - - if (!reqPort.blocked()) { - reqPort.sendPacket(first_update); - } -} - void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5e8b079d88..ce9045e91a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -112,8 +112,6 @@ class PushEngine : public BaseMemEngine // always be limited by the b/w of the memory. std::deque memRespQueue; - virtual void startup(); - template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextAddrGenEvent; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ad9e93ba60..40fca42d26 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -58,6 +58,12 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } } +void +WLEngine::init() +{ + respPort.sendRangeChange(); +} + AddrRangeList WLEngine::RespPort::getAddrRanges() const { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 12df93ee79..2698ce3ea8 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -65,6 +65,8 @@ class WLEngine : public BaseReduceEngine virtual void recvRespRetry(); }; + virtual void init(); + RespPort respPort; bool blockedByCoalescer; From a95da7b0dc83e976b444f5304e818ffe96adf90e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 22 Apr 2022 11:44:24 -0700 Subject: [PATCH 095/247] Adding UpdateWL as a MemCmd and fixing code. 
--- configs/accl/sega.py | 5 +- src/accl/graph/TODO.md | 5 + src/accl/graph/base/data_structs.hh | 3 + src/accl/graph/sega/centeral_controller.cc | 14 +- src/accl/graph/sega/coalesce_engine.cc | 195 +++++++++------------ src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 31 +--- src/mem/packet.cc | 40 +---- src/mem/packet.hh | 4 +- 9 files changed, 105 insertions(+), 194 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c4288c92d3..aa3675d847 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -65,14 +65,15 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/test/vertices_0", + vertex_binary="graphs/epinions/graph_binaries/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/test/edgelist_0") + edge_binary="graphs/epinions/graph_binaries/edgelist_0") self.interconnect = SystemXBar() self.ctrl.req_port = self.interconnect.cpu_side_ports self.mpu.setReqPort(self.interconnect.cpu_side_ports) self.mpu.setRespPort(self.interconnect.mem_side_ports) + self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index 1cec4dc6f9..f5690a3faa 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,3 +1,8 @@ # TODO Items + * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys +* add UpdateWL as a MemCmd +* Replace std::floor with roundDown from intmath.hh in src +* We might need to revisit the fact that we could insert something to a queue on + the same cycle that another event is consuming something from the queue. 
diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 409245eeaa..7535d4bbac 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ #include "base/cprintf.hh" +#include "base/intmath.hh" namespace gem5 { @@ -81,6 +82,8 @@ struct __attribute__ ((packed)) Edge {} }; +static_assert(isPowerOf2(sizeof(WorkListItem))); + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index daa2d9b390..41ebeb9cd6 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -54,8 +54,7 @@ CenteralController::getPort(const std::string &if_name, PortID idx) void CenteralController::startup() { - PacketPtr first_update = - createUpdatePacket(addr, value); + PacketPtr first_update = createUpdatePacket(addr, value); if (!reqPort.blocked()) { reqPort.sendPacket(first_update); @@ -71,8 +70,7 @@ CenteralController::createUpdatePacket(Addr addr, T value) // bits req->setPC(((Addr) value) << 2); - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); // pkt->setData(data); @@ -81,14 +79,6 @@ CenteralController::createUpdatePacket(Addr addr, T value) return pkt; } -// AddrRangeList -// CenteralController::ReqPort::getAddrRanges() const -// { -// AddrRangeList ret; -// ret.clear(); -// return ret; -// } - void CenteralController::ReqPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e54447fd09..e6503ea01d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/wl_engine.hh" +#include "base/intmath.hh" #include 
"debug/ApplyUpdates.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -47,6 +48,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) { + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); cacheBlocks = new Block [numLines]; for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); @@ -72,18 +74,25 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + // TODO: Add a hit latency as a param for this object. + // Can't just schedule the nextRespondEvent for latency cycles in + // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); + // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.readHits++; @@ -104,6 +113,8 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries DPRINTF(MPU, "%s: Out of MSHR entries. 
" "Rejecting request.\n", __func__); + // TODO: Break out read rejections into more than one stat + // based on the cause of the rejection stats.readRejections++; return false; } else { @@ -200,6 +211,7 @@ CoalesceEngine::recvReadAddr(Addr addr) } } +// TODO: For loop to empty the entire responseQueue. void CoalesceEngine::processNextRespondEvent() { @@ -241,8 +253,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } Addr addr = pkt->getAddr(); - uint8_t* data = pkt->getPtr(); - int block_index = (addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", @@ -250,17 +260,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - cacheBlocks[block_index].items[i] = *((WorkListItem*) ( - data + (i * sizeof(WorkListItem)))); DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; delete pkt; - int bias = 0; + // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; @@ -271,20 +281,26 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); + // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].takenMask |= (1 << wl_offset); + // End of the said block servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); } } + // TODO: We Can use taken instead of this + // TODO: Change the MSHRMap from map to map + int bias = 0; for (int i = 0; i < servicedIndices.size(); i++) { Addr print_addr = MSHRMap[block_index][i - bias]; MSHRMap[block_index].erase(MSHRMap[block_index].begin() + @@ -298,8 +314,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHRMap.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { - // TODO: I think this is unnecessary. - cacheBlocks[block_index].hasConflict = true; + assert(cacheBlocks[block_index].hasConflict); } if ((!nextRespondEvent.scheduled()) && @@ -341,11 +356,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { cacheBlocks[block_index].hasChange = true; + stats.numVertexWrites++; } cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
@@ -380,8 +395,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); + if (cacheBlocks[block_index].takenMask == 0) { - if ((cacheBlocks[block_index].hasChange)&& + if ((cacheBlocks[block_index].hasChange) && (cacheBlocks[block_index].hasConflict) && (memReqQueueHasSpace(2))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", @@ -420,6 +436,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].items[i].to_string()); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); + // TODO: Add a stat to count the number of changed props. DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", __func__, block_index, i); } @@ -434,117 +451,65 @@ CoalesceEngine::processNextApplyAndCommitEvent() (uint8_t*) cacheBlocks[block_index].items); DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), peerMemoryAtomSize); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * - peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket( - aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
- " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); - - enqueueMemReq(write_pkt); - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } - } - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - enqueueMemReq(write_pkt); - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } + enqueueMemReq(write_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet to " + "outstandingMemReqQueue.\n" , __func__); + + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", + __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); } - - // Since allocated is false, does not matter what the address is. - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } - } else { - DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary.\n", __func__, block_index); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); + } - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * + if (cacheBlocks[block_index].hasConflict) { + assert(!MSHRMap[block_index].empty()); + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for a read " + "packet.\n", __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket( - aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " - "deallocating the line.\n", __func__, block_index); - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, + peerMemoryAtomSize); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); + enqueueMemReq(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue.\n" , __func__); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue.\n", __func__); + + // Since allocated is false, does not matter what the address is. + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; } + } else { DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " "for eviction. 
Therefore, ignoring the evict schedule.\n", diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ddfc2edef8..e822b7168b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -238,7 +238,7 @@ PushEngine::createUpdatePacket(Addr addr, T value) req->setPC(((Addr) _requestorId) << 2); // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); // pkt->setData(data); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 40fca42d26..148f5de5be 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -121,6 +121,8 @@ WLEngine::getAddrRanges() const return coalesceEngine->getAddrRanges(); } +// TODO: Parameterize the number of pops WLEngine can do at a time. +// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void WLEngine::processNextReadEvent() { @@ -144,9 +146,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); - if (updateQueue.size() == updateQueueSize - 1) { - respPort.checkRetryReq(); - } + respPort.checkRetryReq(); } } } else { @@ -164,9 +164,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" ". 
updateQueue.size = %u.\n", __func__, updateQueue.size()); - if (updateQueue.size() == updateQueueSize - 1) { - respPort.checkRetryReq(); - } + respPort.checkRetryReq(); } // TODO: Only schedule nextReadEvent only when it has to be scheduled @@ -194,12 +192,9 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - std::unordered_map::iterator it = - addrWorkListMap.begin(); - - std::vector servicedAddresses; - while (it != addrWorkListMap.end()) { - Addr addr = it->first; + for (auto &it : addrWorkListMap) { + Addr addr = it.first; + assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); uint32_t update_value = onTheFlyUpdateMap[addr]; DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " "addrWorkListMap values. onTheFlyUpdateMap[%lu] = %u, " @@ -214,17 +209,9 @@ WLEngine::processNextReduceEvent() stats.numReduce++; coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); - servicedAddresses.push_back(addr); - DPRINTF(MPU, "%s: Added addr: %lu to servicedAdresses.\n", - __func__, addr); - it++; - } - - addrWorkListMap.clear(); - for (int i = 0; i < servicedAddresses.size(); i++) { - onTheFlyUpdateMap.erase(servicedAddresses[i]); + onTheFlyUpdateMap.erase(addr); DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, servicedAddresses[i]); + __func__, addr); } } diff --git a/src/mem/packet.cc b/src/mem/packet.cc index da45246e49..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL"} }; AddrRange @@ -532,43 +533,4 @@ Packet::getHtmTransactionUid() const return htmTransactionUid; } -std::string -Packet::printData() -{ - char ret[1024]; - if (isWrite()) { - uint8_t* data = getPtr(); - std::sprintf(ret,"\n" - "V[%lu] temp_prop: %u, prop: %u, " 
- "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n", - getAddr(), - *((uint32_t*) data), - *((uint32_t*) (data + 4)), - *((uint32_t*) (data + 8)), - *((uint32_t*) (data + 12)), - getAddr() + 16, - *((uint32_t*) (data + 16)), - *((uint32_t*) (data + 20)), - *((uint32_t*) (data + 24)), - *((uint32_t*) (data + 28)), - getAddr() + 32, - *((uint32_t*) (data + 32)), - *((uint32_t*) (data + 36)), - *((uint32_t*) (data + 40)), - *((uint32_t*) (data + 44)), - getAddr() + 48, - *((uint32_t*) (data + 48)), - *((uint32_t*) (data + 52)), - *((uint32_t*) (data + 56)), - *((uint32_t*) (data + 60))); - } - return ret; -} - } // namespace gem5 diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 8803eacced..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,7 +149,7 @@ class MemCmd // Tlb shootdown TlbiExtSync, // MPU Accelerator - // UpdateWL, + UpdateWL, NUM_MEM_CMDS }; @@ -1374,8 +1374,6 @@ class Packet : public Printable template void setRaw(T v); - std::string printData(); - public: /** * Check a functional request against a memory value stored in From e4b665c796dbe348a511585c3eb2c1b3d87630b4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Apr 2022 20:28:25 -0700 Subject: [PATCH 096/247] A little bit of debugging and updating config script. 
--- configs/accl/sega.py | 138 +++++++++++++++++++++++-------- src/accl/graph/TODO.md | 5 +- src/accl/graph/sega/wl_engine.cc | 1 + src/accl/graph/sega/wl_engine.hh | 2 +- 4 files changed, 105 insertions(+), 41 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index aa3675d847..9dd8c0f358 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,5 +1,9 @@ import m5 +import argparse + +from math import log from m5.objects import * +from m5.util.convert import toMemorySize class MPU(SubSystem): def __init__(self, base_edge_addr): @@ -35,53 +39,115 @@ def setEdgeMemPort(self, port): self.push_engine.mem_port = port class MPUMemory(SubSystem): - def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): + def __init__(self, + num_channels: int, + cache_line_size: int, + vertex_memory_size: str, + edge_memory_size: str, + graph_path: str): super(MPUMemory, self).__init__() - self.vertex_mem_ctrl = SimpleMemory( - range=vertex_range, bandwidth="19.2GB/s", - latency="30ns", image_file=vertex_binary) - self.edge_mem_ctrl = SimpleMemory( - range=edge_range, bandwidth="19.2GB/s", - latency="30ns", image_file=edge_binary) - - def getVertexPort(self): - return self.vertex_mem_ctrl.port - def setVertexPort(self, port): - self.vertex_mem_ctrl.port = port - - def getEdgePort(self): - return self.edge_mem_ctrl.port - def setEdgePort(self, port): - self.edge_mem_ctrl.port = port + + self._vertex_ranges = self._interleave_addresses( + AddrRange(start=0, size=vertex_memory_size),\ + num_channels,\ + cache_line_size) + + self._edge_chunk_size = int(\ + toMemorySize(edge_memory_size)/num_channels) + self._edge_ranges = [AddrRange(\ + start=toMemorySize(vertex_memory_size)+\ + self._edge_chunk_size*i,\ + size=self._edge_chunk_size)\ + for i in range(num_channels)] + + vertex_mem_ctrl = [] + edge_mem_ctrl = [] + for i in range(num_channels): + vertex_mem_ctrl.append( + SimpleMemory(range=self._vertex_ranges[i], + bandwidth="19.2GB/s", + 
latency="30ns", + image_file=f"{graph_path}/vertices_{i}") + ) + edge_mem_ctrl.append( + SimpleMemory(range=self._edge_ranges[i], + bandwidth="19.2GB/s", + latency="30ns", + image_file=f"{graph_path}/edgelist_{i}") + ) + self.vertex_mem_ctrl = vertex_mem_ctrl + self.edge_mem_ctrl = edge_mem_ctrl + + def _interleave_addresses(self, + plain_range, + num_channels, + cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + + def getVertexPort(self, i): + return self.vertex_mem_ctrl[i].port + def setVertexPort(self, port, i): + self.vertex_mem_ctrl[i].port = port + + def getEdgeBaseAddr(self, i): + return self._edge_ranges[i].start + def getEdgePort(self, i): + return self.edge_mem_ctrl[i].port + def setEdgePort(self, port, i): + self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self): + def __init__(self, num_mpus, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.ctrl = CenteralController(addr=0, value=0) - self.mpu = MPU(base_edge_addr=0x80000000) - self.mem_ctrl = MPUMemory( - vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/epinions/graph_binaries/vertices_0", - edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/epinions/graph_binaries/edgelist_0") - self.interconnect = SystemXBar() + self.interconnect = NoncoherentXBar(frontend_latency=1, + forward_latency=1, + response_latency=1, + width=64) + self.ctrl = CenteralController(addr=0, value=0) self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mpu.setReqPort(self.interconnect.cpu_side_ports) - self.mpu.setRespPort(self.interconnect.mem_side_ports) - 
self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) - self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) + self.mem_ctrl = MPUMemory(num_mpus, 32, "2GiB", "2GiB", graph_path) + + mpus = [] + for i in range(num_mpus): + mpus.append(MPU(base_edge_addr=self.mem_ctrl.getEdgeBaseAddr(i))) + mpus[i].setReqPort(self.interconnect.cpu_side_ports) + mpus[i].setRespPort(self.interconnect.mem_side_ports) + mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) + mpus[i].setEdgeMemPort(self.mem_ctrl.getEdgePort(i)) + self.mpu = mpus + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_mpus", type=int) + argparser.add_argument("graph_path", type=str) + args = argparser.parse_args() + return args.num_mpus, args.graph_path -system = SEGA() -root = Root(full_system = False, system = system) +if __name__ == "__m5_main__": + num_mpus, graph_path = get_inputs() + print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") + system = SEGA(num_mpus, graph_path) + root = Root(full_system = False, system = system) -m5.instantiate() + m5.instantiate() -exit_event = m5.simulate() -print("Simulation finished!") -exit() + exit_event = m5.simulate() + print("Simulation finished!") + exit() diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index f5690a3faa..29b5a2939e 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,8 +1,5 @@ # TODO Items -* implement all the communications between simobjects as req/retry. -* get rid of maps with RequestPtr as keys -* add UpdateWL as a MemCmd * Replace std::floor with roundDown from intmath.hh in src * We might need to revisit the fact that we could insert something to a queue on - the same cycle that another event is consuming something from the queue. + the same cycle that another event is consuming something from the queue. 
\ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 148f5de5be..e949cbcf5b 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -213,6 +213,7 @@ WLEngine::processNextReduceEvent() DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", __func__, addr); } + addrWorkListMap.clear(); } bool diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2698ce3ea8..597fdb2b1e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -52,7 +52,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) + ResponsePort(name, owner), owner(owner), needSendRetryReq(false) {} virtual AddrRangeList getAddrRanges() const; From c8b7b26fcc071883bb70cbaf31b936249a4b20be Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Apr 2022 16:56:04 -0700 Subject: [PATCH 097/247] Adding initState to CenteralController. 
--- configs/accl/sega.py | 23 ++++++++++------ src/accl/graph/sega/CenteralController.py | 3 ++ src/accl/graph/sega/centeral_controller.cc | 32 ++++++++++++++++++++++ src/accl/graph/sega/centeral_controller.hh | 6 +++- 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 9dd8c0f358..0907ba77de 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -66,8 +66,7 @@ def __init__(self, vertex_mem_ctrl.append( SimpleMemory(range=self._vertex_ranges[i], bandwidth="19.2GB/s", - latency="30ns", - image_file=f"{graph_path}/vertices_{i}") + latency="30ns") ) edge_mem_ctrl.append( SimpleMemory(range=self._edge_ranges[i], @@ -108,21 +107,28 @@ def setEdgePort(self, port, i): self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self, num_mpus, graph_path): + def __init__(self, num_mpus, vertex_cache_line_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = vertex_cache_line_size self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0) + self.ctrl = CenteralController(addr=0, value=0, + image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mem_ctrl = MPUMemory(num_mpus, 32, "2GiB", "2GiB", graph_path) + self.mem_ctrl = MPUMemory( + num_mpus, + self.cache_line_size, + "2GiB", + "2GiB", + graph_path) mpus = [] for i in range(num_mpus): @@ -136,14 +142,15 @@ def __init__(self, num_mpus, graph_path): def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) + argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("graph_path", type=str) args = argparser.parse_args() - return args.num_mpus, args.graph_path + return args.num_mpus, args.vertex_cache_line_size, 
args.graph_path if __name__ == "__m5_main__": - num_mpus, graph_path = get_inputs() + num_mpus, vertex_cache_line_size, graph_path = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, graph_path) + system = SEGA(num_mpus, vertex_cache_line_size, graph_path) root = Root(full_system = False, system = system) m5.instantiate() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 7b00f8b12d..bd2f6320a8 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -34,6 +34,9 @@ class CenteralController(ClockedObject): cxx_header = "accl/graph/sega/centeral_controller.hh" cxx_class = 'gem5::CenteralController' + system = Param.System(Parent.any, "System this Engine is a part of") req_port = RequestPort("Port to send updates to the outside") addr = Param.Addr("") value = Param.Int(0, "") + + image_file = Param.String("Path to the global memory image.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 41ebeb9cd6..3c05972224 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,9 @@ #include "accl/graph/sega/centeral_controller.hh" +#include "base/loader/memory_image.hh" +#include "base/loader/object_file.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" namespace gem5 @@ -36,6 +39,7 @@ namespace gem5 CenteralController::CenteralController (const CenteralControllerParams ¶ms): ClockedObject(params), + system(params.system), reqPort(name() + ".req_port", this), addr(params.addr), value(params.value) @@ -51,6 +55,26 @@ CenteralController::getPort(const std::string &if_name, PortID idx) } } +void +CenteralController::initState() +{ + ClockedObject::initState(); + + const auto &file = params().image_file; + if (file == "") + return; + + auto *object = loader::createObjectFile(file, true); + 
fatal_if(!object, "%s: Could not load %s.", name(), file); + + loader::debugSymbolTable.insert(*object->symtab().globals()); + loader::MemoryImage image = object->buildImage(); + PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, + system->cacheLineSize()); + + panic_if(!image.write(proxy), "%s: Unable to write image."); +} + void CenteralController::startup() { @@ -110,4 +134,12 @@ CenteralController::ReqPort::recvReqRetry() } } +void +CenteralController::functionalAccess(PacketPtr pkt) +{ + DPRINTF(MPU, "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); + reqPort.sendFunctional(pkt); +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 0e1bb6ac80..102800de92 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -61,16 +61,20 @@ class CenteralController : public ClockedObject virtual void recvReqRetry(); }; + System* system; ReqPort reqPort; Addr addr; uint32_t value; - template PacketPtr + template PacketPtr createUpdatePacket(Addr addr, T value); + virtual void initState(); virtual void startup(); + void functionalAccess(PacketPtr pkt); + public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); From f0bf6143f964c3ddbd5197d1d77efee8fe0381e8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Apr 2022 15:28:06 -0700 Subject: [PATCH 098/247] Changing debug flag for CenteralController. 
--- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/centeral_controller.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index c8810bbdb2..16fab86ede 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -38,3 +38,4 @@ Source('push_engine.cc') Source('wl_engine.cc') DebugFlag('ApplyUpdates') +DebugFlag('CenteralController') diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 3c05972224..f19c93ebac 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -30,7 +30,7 @@ #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" -#include "debug/MPU.hh" +#include "debug/CenteralController.hh" #include "mem/packet_access.hh" namespace gem5 @@ -137,7 +137,8 @@ CenteralController::ReqPort::recvReqRetry() void CenteralController::functionalAccess(PacketPtr pkt) { - DPRINTF(MPU, "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", + DPRINTF(CenteralController, + "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); reqPort.sendFunctional(pkt); } From 4485e3b2b981fc620daabd7470d8bc8d9adcf978 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 3 May 2022 09:33:52 -0700 Subject: [PATCH 099/247] Fixing a bug and adding new stats. 
--- configs/accl/sega.py | 9 ++++++--- src/accl/graph/sega/coalesce_engine.cc | 4 +++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 19 ++++++++++++++++++- src/accl/graph/sega/push_engine.hh | 13 +++++++++++++ 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0907ba77de..bfdad58f72 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,13 +9,15 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=0, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, - attached_memory_atom_size=32) + attached_memory_atom_size=32, + cache_size="1MiB", + num_mshr_entry=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, + update_queue_size=32, on_the_fly_update_map_size=8) def getRespPort(self): @@ -113,6 +115,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = vertex_cache_line_size + self.mem_mode = "timing" self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e6503ea01d..fbe593507a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -199,7 +199,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if (aligned_addr != cacheBlocks[block_index].addr) { stats.readMisses++; } else { - stats.readHits++; + stats.readHitUnderMisses++; } MSHRMap[block_index].push_back(addr); @@ -538,6 +538,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hits."), ADD_STAT(readMisses, statistics::units::Count::get(), "Number of cache misses."), 
+ ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), "Number of cache rejections.") { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index efd19d3e9b..ce019ef969 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,6 +106,7 @@ class CoalesceEngine : public BaseMemEngine statistics::Scalar numVertexWrites; statistics::Scalar readHits; statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e822b7168b..69b9f3f23e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -40,7 +40,8 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()) + nextPushEvent([this] { processNextPushEvent(); }, name()), + stats(*this) {} Port& @@ -207,6 +208,7 @@ PushEngine::processNextPushEvent() if (!reqPort.blocked()) { reqPort.sendPacket(update); + stats.numUpdates++; DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); @@ -247,4 +249,19 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } +PushEngine::PushStats::PushStats(PushEngine &_push) + : statistics::Group(&_push), + push(_push), + + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of sent updates.") +{ +} + +void +PushEngine::PushStats::regStats() +{ + using namespace statistics; +} + } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 
ce9045e91a..7a6981daa0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -120,6 +120,19 @@ class PushEngine : public BaseMemEngine EventFunctionWrapper nextPushEvent; void processNextPushEvent(); + struct PushStats : public statistics::Group + { + PushStats(PushEngine &push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numUpdates; + }; + + PushStats stats; + protected: virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); From c17fb8b04a02fdd590aa3ea5df55cedef47b1f18 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 17 May 2022 10:56:09 -0700 Subject: [PATCH 100/247] Fixing double evicts. --- configs/accl/sega.py | 6 +++--- src/accl/graph/sega/coalesce_engine.cc | 27 ++++++++++---------------- src/accl/graph/sega/coalesce_engine.hh | 3 --- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index bfdad58f72..b799b05dc5 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -122,7 +122,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0, + self.ctrl = CenteralController(addr=192, value=0, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports @@ -130,7 +130,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): num_mpus, self.cache_line_size, "2GiB", - "2GiB", + "14GiB", graph_path) mpus = [] @@ -158,6 +158,6 @@ def get_inputs(): m5.instantiate() - exit_event = m5.simulate() + exit_event = m5.simulate(1000000000000) print("Simulation finished!") exit() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fbe593507a..b41f6b1db7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -325,22 +325,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } -PacketPtr 
-CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { @@ -370,7 +354,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); - evictQueue.push_back(block_index); + // TODO: Fix this hack + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + } + } + if (!found) { + evictQueue.push_back(block_index); + } DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ce019ef969..e86014fc25 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -85,9 +85,6 @@ class CoalesceEngine : public BaseMemEngine std::deque evictQueue; - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); - // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); - EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); From 4c8ebec475ae4473c8819f59cc3c09804613d7bc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 18 May 2022 17:23:05 -0700 Subject: [PATCH 101/247] Fixing false dependency and deadlock issues. wip. 
--- src/accl/graph/sega/coalesce_engine.cc | 74 +++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b41f6b1db7..92d82bce35 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -347,9 +347,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); + // TODO: Make this more general and programmable. - // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add - // to evictQueue. if ((cacheBlocks[block_index].takenMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", @@ -359,6 +358,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) for (auto i : evictQueue) { if (i == block_index) { found = true; + break; } } if (!found) { @@ -376,6 +376,76 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } +void +CoalesceEngine::processNextApplyEvent() +{ + int block_index = applyQueue.front(); + + if (cacheBlocks[block_index].takenMask) { + DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " + "Therefore, ignoring the apply schedule.\n", + __func__, block_index); + stats.falseApplySchedules++; + } else if (!cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " + "needed. 
Adding the cache line to evict schedule.\n", + __func__, block_index); + evictQueue.push_back(block_index); + } else { + for (int i = 0; i < numElementsPerLine; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + // TODO: Is this correct? + cacheBlocks[block_index].items[i].tempProp = cacheBlocks[block_index].items[i].prop; + + if (cacheBlocks[block_index].items[i].prop != old_prop) { + if (peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i])) { + DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", + __func__, + cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); + } else { + // peerPushEngine->setPushAlarm(); + // pendingPushAlarm = true; + return; + } + } + } + // TODO: This is where eviction policy goes + evictQueue.push_back(block_index); + } + + applyQueue.pop_front(); + + if ((!evictQueue.empty()) && + (!pendingAlarm()) && + (!nextEvictEvent.scheduled())) { + schedule(nextEvictEvent, nextCycle()); + } + + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextEvictEvent() +{ + int block_index = evictQueue.front(); + + if (cacheBlocks[block_index].takenMask) { + DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " + "Therefore, ignoring the apply schedule.\n", + __func__, block_index); + stats.falseEvictSchedules++; + } else { + int space_needed = cacheBlocks + } +} + void CoalesceEngine::processNextApplyAndCommitEvent() { From 7e7f09d79330b2de27c62d3d07e7bf141c20ccd3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 19 May 2022 12:10:10 -0700 Subject: [PATCH 102/247] Decoupling apply and evict. Done. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 214 ++++++++----------------- src/accl/graph/sega/coalesce_engine.hh | 11 +- 3 files changed, 81 insertions(+), 146 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b799b05dc5..9d8b449e0f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -122,7 +122,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=192, value=0, + self.ctrl = CenteralController(addr=0, value=0, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 92d82bce35..f3402255bc 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,7 +45,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), + nextApplyEvent([this] { processNextApplyEvent(); }, name()), + nextEvictEvent([this] { processNextEvictEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -237,8 +238,8 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::respondToAlarm() { - assert(!nextApplyAndCommitEvent.scheduled()); - schedule(nextApplyAndCommitEvent, nextCycle()); + assert(pendingAlarm() && (!nextEvictEvent.scheduled())); + schedule(nextEvictEvent, nextCycle()); } bool @@ -362,16 +363,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } } if (!found) { - evictQueue.push_back(block_index); + applyQueue.push_back(block_index); } DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } - if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty()) && - (!pendingAlarm())) { - schedule(nextApplyAndCommitEvent, nextCycle()); + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); } } @@ -442,150 +442,74 @@ CoalesceEngine::processNextEvictEvent() __func__, block_index); stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks - } -} - -void -CoalesceEngine::processNextApplyAndCommitEvent() -{ - // FIXME: Refactor the line below to work with the new inheritance. - // assert((!alarmRequested) && (spaceRequested == 0)); - int block_index = evictQueue.front(); - uint8_t changedMask = 0; - - DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", - __func__, block_index); - DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " - "then commited.\n", __func__, block_index); - - if (cacheBlocks[block_index].takenMask == 0) { - if ((cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(2))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict)) { - DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", - __func__, block_index); - } else { - int spaceNeeded = cacheBlocks[block_index].hasConflict ? 
2 : 1; - requestAlarm(spaceNeeded); - DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " - "an alarm for nextApplyAndCommitEvent when there is %d space.\n", - __func__, spaceNeeded); + int space_needed = cacheBlocks[block_index].hasChange ? + (cacheBlocks[block_index].hasConflict ? 2 : 1) : + (cacheBlocks[block_index].hasConflict ? 1 : 0); + if (!memReqQueueHasSpace(space_needed)) { + DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " + "procees the eviction of cache line [%d]. hasChange: %d, " + "hasConflict: %d.\n", __func__, block_index, + cacheBlocks[block_index].hasChange, + cacheBlocks[block_index].hasConflict); + requestAlarm(space_needed); return; - } - - // Reducing between tempProp and prop for each item in the cache line. - for (int i = 0; i < numElementsPerLine; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, - block_index, i, - cacheBlocks[block_index].items[i].to_string()); - if (old_prop != cacheBlocks[block_index].items[i].prop) { - changedMask |= (1 << i); - // TODO: Add a stat to count the number of changed props. 
- DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", - __func__, block_index, i); + } else { + if (cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", + __func__, block_index); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, " + "size = %d.\n", __func__, + write_pkt->getAddr(), write_pkt->getSize()); + enqueueMemReq(write_pkt); } - } - if (cacheBlocks[block_index].hasChange) { - DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" - , __func__, block_index); - - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = %d.\n", - __func__, write_pkt->getAddr(), peerMemoryAtomSize); - enqueueMemReq(write_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet to " - "outstandingMemReqQueue.\n" , __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", - __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } - } - } + if (cacheBlocks[block_index].hasConflict) { + assert(!MSHRMap[block_index].empty()); + Addr miss_addr = MSHRMap[block_index].front(); + DPRINTF(MPU, "%s: First conflicting address for cache line[%d]" + " is Addr: %lu.\n", __func__, block_index, miss_addr); - if (cacheBlocks[block_index].hasConflict) { - assert(!MSHRMap[block_index].empty()); - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for a read " - "packet.\n", __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, - peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - // Since allocated is false, does not matter what the address is. - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, + peerMemoryAtomSize); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, + read_pkt->getAddr(), read_pkt->getSize()); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", + __func__, block_index, aligned_miss_addr); + } else { - } else { - DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " - "for eviction. Therefore, ignoring the evict schedule.\n", - __func__, block_index); + // Since allocated is false, does not matter what the address is. + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", + __func__, block_index); + } + } } evictQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())) { - schedule(nextApplyAndCommitEvent, nextCycle()); + if ((!evictQueue.empty()) && + (!nextEvictEvent.scheduled())) { + schedule(nextEvictEvent, nextCycle()); } } @@ -604,7 +528,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), - "Number of cache rejections.") + "Number of cache rejections."), + ADD_STAT(falseApplySchedules, statistics::units::Count::get(), + "Number of failed apply schedules."), + ADD_STAT(falseEvictSchedules, statistics::units::Count::get(), + "Number of failed evict schedules.") { } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e86014fc25..82b03f53aa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -83,13 +83,18 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + std::deque applyQueue; + std::deque evictQueue; EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); - EventFunctionWrapper nextApplyAndCommitEvent; - void processNextApplyAndCommitEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + + EventFunctionWrapper nextEvictEvent; + void processNextEvictEvent(); struct CoalesceStats : public statistics::Group { @@ -105,6 +110,8 @@ class CoalesceEngine : public BaseMemEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; + statistics::Scalar falseApplySchedules; + statistics::Scalar falseEvictSchedules; }; CoalesceStats stats; From 550a9fed64190cb41db8366425e3b793c8c5ada8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 19 May 2022 21:20:07 -0700 Subject: [PATCH 103/247] Fixed miss-deallocation 
bug. Hopefully. --- configs/accl/sega.py | 2 +- src/accl/graph/base/base_mem_engine.cc | 24 +++--- src/accl/graph/base/base_mem_engine.hh | 17 ++-- src/accl/graph/sega/coalesce_engine.cc | 107 +++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 13 +-- src/accl/graph/sega/push_engine.cc | 26 ++++-- src/accl/graph/sega/push_engine.hh | 11 ++- src/accl/graph/sega/wl_engine.cc | 1 - src/accl/graph/sega/wl_engine.hh | 1 - 9 files changed, 136 insertions(+), 66 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 9d8b449e0f..31b65ae726 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=0, + push_req_queue_size=16, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 50e64ae7c3..f02f1d2feb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -37,8 +37,8 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): system(params.system), memPort(name() + ".mem_port", this), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - alarmRequested(false), - spaceRequested(0), + memAlarmRequested(false), + memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)), peerMemoryAtomSize(params.attached_memory_atom_size) @@ -106,12 +106,12 @@ BaseMemEngine::processNextMemReqEvent() __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - if (alarmRequested && + if (memAlarmRequested && (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - spaceRequested))) { - alarmRequested = false; - spaceRequested = 0; - respondToAlarm(); + 
(outstandingMemReqQueueSize - memSpaceRequested))) { + memAlarmRequested = false; + memSpaceRequested = 0; + respondToMemAlarm(); } if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { @@ -151,7 +151,7 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseMemEngine::memReqQueueHasSpace(int space) +BaseMemEngine::allocateMemReqSpace(int space) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return ( @@ -179,13 +179,13 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) } void -BaseMemEngine::requestAlarm(int space) { - panic_if((alarmRequested == true) || (spaceRequested != 0), +BaseMemEngine::requestMemAlarm(int space) { + panic_if((memAlarmRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); - alarmRequested = true; - spaceRequested = space; + memAlarmRequested = true; + memSpaceRequested = space; } void diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index fb7cab91b0..8a18807e2e 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -69,8 +69,8 @@ class BaseMemEngine : public ClockedObject MemPort memPort; int outstandingMemReqQueueSize; - bool alarmRequested; - int spaceRequested; + bool memAlarmRequested; + int memSpaceRequested; std::deque outstandingMemReqQueue; EventFunctionWrapper nextMemReqEvent; @@ -81,15 +81,16 @@ class BaseMemEngine : public ClockedObject size_t peerMemoryAtomSize; - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - - bool memReqQueueHasSpace(int space); + bool allocateMemReqSpace(int space); bool memReqQueueFull(); + + bool pendingMemAlarm() { return memAlarmRequested; } + void requestMemAlarm(int space); + + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void 
enqueueMemReq(PacketPtr pkt); - bool pendingAlarm() { return alarmRequested; } - void requestAlarm(int space); - virtual void respondToAlarm() = 0; + virtual void respondToMemAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f3402255bc..36faff2c6a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -44,6 +44,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + pendingPushAlarm(false), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -54,6 +55,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + peerPushEngine->registerCoalesceEngine(this); } void @@ -91,10 +93,11 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, responseQueue.size(), - cacheBlocks[block_index].items[wl_offset].to_string()); + __func__, addr, block_index, wl_offset, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].takenMask |= (1 << wl_offset); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; assert(!responseQueue.empty()); @@ -156,7 +159,7 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -236,9 +239,9 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::respondToAlarm() +CoalesceEngine::respondToMemAlarm() { - assert(pendingAlarm() && (!nextEvictEvent.scheduled())); + assert(pendingMemAlarm() && (!nextEvictEvent.scheduled())); schedule(nextEvictEvent, nextCycle()); } @@ -290,7 +293,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) , __func__, block_index, wl_offset, responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].takenMask |= (1 << wl_offset); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block servicedIndices.push_back(i); @@ -336,27 +339,27 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); - assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { - cacheBlocks[block_index].hasChange = true; + cacheBlocks[block_index].dirty = true; stats.numVertexWrites++; } cacheBlocks[block_index].items[wl_offset] = wl; - cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].takenMask == 0)) { + if ((cacheBlocks[block_index].busyMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); // TODO: Fix this hack bool found = false; - for (auto i : evictQueue) { + for (auto i : applyQueue) { if (i == block_index) { found = true; break; @@ -364,12 +367,13 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (!found) { applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); } - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); } if ((!applyQueue.empty()) && + (!pendingPushAlarm) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } @@ -381,16 +385,27 @@ CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); - if (cacheBlocks[block_index].takenMask) { + if (cacheBlocks[block_index].busyMask) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; - } else if (!cacheBlocks[block_index].hasChange) { + } else if (!cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " "needed. Adding the cache line to evict schedule.\n", __func__, block_index); - evictQueue.push_back(block_index); + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); + } } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; @@ -407,20 +422,32 @@ CoalesceEngine::processNextApplyEvent() __func__, cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); } else { - // peerPushEngine->setPushAlarm(); - // pendingPushAlarm = true; + peerPushEngine->setPushAlarm(); + pendingPushAlarm = true; return; } } } // TODO: This is where eviction policy goes - evictQueue.push_back(block_index); + // TODO: Fix this hack. + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); + } } applyQueue.pop_front(); if ((!evictQueue.empty()) && - (!pendingAlarm()) && + (!pendingMemAlarm()) && (!nextEvictEvent.scheduled())) { schedule(nextEvictEvent, nextCycle()); } @@ -436,25 +463,33 @@ CoalesceEngine::processNextEvictEvent() { int block_index = evictQueue.front(); - if (cacheBlocks[block_index].takenMask) { + bool found_in_apply_queue = false; + for (auto i : applyQueue) { + if (i == block_index) { + found_in_apply_queue = true; + break; + } + } + if ((cacheBlocks[block_index].busyMask) || + (found_in_apply_queue)) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks[block_index].hasChange ? + int space_needed = cacheBlocks[block_index].dirty ? (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!memReqQueueHasSpace(space_needed)) { + if (!allocateMemReqSpace(space_needed)) { DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " - "procees the eviction of cache line [%d]. hasChange: %d, " + "procees the eviction of cache line [%d]. 
dirty: %d, " "hasConflict: %d.\n", __func__, block_index, - cacheBlocks[block_index].hasChange, + cacheBlocks[block_index].dirty, cacheBlocks[block_index].hasConflict); - requestAlarm(space_needed); + requestMemAlarm(space_needed); return; } else { - if (cacheBlocks[block_index].hasChange) { + if (cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", __func__, block_index); PacketPtr write_pkt = createWritePacket( @@ -484,21 +519,21 @@ CoalesceEngine::processNextEvictEvent() enqueueMemReq(read_pkt); cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; + cacheBlocks[block_index].dirty = false; DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", __func__, block_index, aligned_miss_addr); } else { // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; + cacheBlocks[block_index].dirty = false; DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", __func__, block_index); } @@ -513,6 +548,14 @@ CoalesceEngine::processNextEvictEvent() } } +void +CoalesceEngine::respondToPushAlarm() +{ + assert(pendingPushAlarm && (!nextApplyEvent.scheduled())); + pendingPushAlarm = false; + schedule(nextApplyEvent, nextCycle()); +} + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 82b03f53aa..824faef10d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -49,21 +49,21 @@ class CoalesceEngine : public BaseMemEngine { WorkListItem* items; Addr addr; - uint8_t takenMask; + uint8_t busyMask; bool allocated; bool valid; bool hasConflict; - bool hasChange; + bool dirty; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} Block(int num_elements): addr(0), - takenMask(0), + busyMask(0), allocated(false), valid(false), hasConflict(false), - hasChange(false) + dirty(false) { items = new WorkListItem [num_elements]; } @@ -83,6 +83,7 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + bool pendingPushAlarm; std::deque applyQueue; std::deque evictQueue; @@ -117,7 +118,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; protected: - virtual void respondToAlarm(); + virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -131,6 +132,8 @@ class CoalesceEngine : public BaseMemEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); + + 
void respondToPushAlarm(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 69b9f3f23e..d5563cca7c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -36,6 +37,7 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), + pushAlarmSet(false), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), @@ -56,6 +58,12 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine) +{ + peerCoalesceEngine = coalesce_engine; +} + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -146,11 +154,15 @@ PushEngine::processNextAddrGenEvent() DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + if (pushAlarmSet && (pushReqQueue.size() == pushReqQueueSize - 1)) { + pushAlarmSet = false; + peerCoalesceEngine->respondToPushAlarm(); + } } if (memReqQueueFull()) { if (!pushReqQueue.empty()) { - requestAlarm(1); + requestMemAlarm(1); } return; } @@ -161,7 +173,7 @@ PushEngine::processNextAddrGenEvent() } void -PushEngine::respondToAlarm() +PushEngine::respondToMemAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); @@ -200,9 +212,6 @@ PushEngine::processNextPushEvent() // TODO: Implement propagate function here uint32_t update_value = value + 1; - DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); - PacketPtr update = createUpdatePacket( curr_edge->neighbor, update_value); @@ -249,6 +258,13 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } +void +PushEngine::setPushAlarm() +{ + assert(!pushAlarmSet); + pushAlarmSet = true; +} + PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7a6981daa0..ce24f862ba 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -36,6 +36,8 @@ namespace gem5 { +class CoalesceEngine; + class PushEngine : public BaseMemEngine { private: @@ -95,6 +97,9 @@ class PushEngine : public BaseMemEngine virtual void recvReqRetry(); }; + bool pushAlarmSet; + CoalesceEngine* peerCoalesceEngine; + ReqPort reqPort; Addr baseEdgeAddr; @@ -134,7 +139,7 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: - virtual void respondToAlarm(); + virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -145,6 +150,10 @@ class PushEngine : public BaseMemEngine PortID idx=InvalidPortID) override; bool recvWLItem(WorkListItem wl); + + void 
registerCoalesceEngine(CoalesceEngine* coalesce_engine); + + void setPushAlarm(); }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e949cbcf5b..75ac4f784e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -37,7 +37,6 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseReduceEngine(params), respPort(name() + ".resp_port", this), - blockedByCoalescer(false), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 597fdb2b1e..27fc3efa7a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -69,7 +69,6 @@ class WLEngine : public BaseReduceEngine RespPort respPort; - bool blockedByCoalescer; CoalesceEngine* coalesceEngine; int updateQueueSize; From 929aab118886fde9e286876fd2dc997be0a8684c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 22 May 2022 14:15:30 -0700 Subject: [PATCH 104/247] Correctness passed with finite push queue and facebook graph. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 20 +++++++++++++------- src/accl/graph/sega/push_engine.cc | 13 ++++++------- src/accl/graph/sega/push_engine.hh | 3 ++- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 31b65ae726..8a6ac783c3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -158,6 +158,6 @@ def get_inputs(): m5.instantiate() - exit_event = m5.simulate(1000000000000) + exit_event = m5.simulate() print("Simulation finished!") exit() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 36faff2c6a..39144972df 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -349,7 +349,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, + DPRINTF(MPU, "%s: Wrote to cache line[%d][%d] = %s.\n", + __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. @@ -409,15 +410,20 @@ CoalesceEngine::processNextApplyEvent() } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( + uint32_t new_prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); - // TODO: Is this correct? 
- cacheBlocks[block_index].items[i].tempProp = cacheBlocks[block_index].items[i].prop; - if (cacheBlocks[block_index].items[i].prop != old_prop) { - if (peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i])) { + if (new_prop != old_prop) { + if (peerPushEngine->allocatePushSpace()) { + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", + __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", __func__, cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d5563cca7c..8cfe3c72cc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -97,7 +97,7 @@ PushEngine::ReqPort::recvReqRetry() } } -bool +void PushEngine::recvWLItem(WorkListItem wl) { // If there are no outdoing edges, no need to generate and push @@ -105,14 +105,14 @@ PushEngine::recvWLItem(WorkListItem wl) if (wl.degree == 0) { DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", __func__, wl.to_string()); - return true; + return; } assert((pushReqQueueSize == 0) || - (pushReqQueue.size() <= pushReqQueueSize)); - if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { - return false; - } + (pushReqQueue.size() < pushReqQueueSize)); + panic_if(pushReqQueue.size() == pushReqQueueSize, "You should call this " + "method after checking if there is enough push space. 
Use " + "allocatePushSpace.\n"); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); @@ -125,7 +125,6 @@ PushEngine::recvWLItem(WorkListItem wl) (!memReqQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } - return true; } void diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ce24f862ba..ae465f6eb1 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -149,7 +149,8 @@ class PushEngine : public BaseMemEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool recvWLItem(WorkListItem wl); + bool allocatePushSpace() { return pushReqQueue.size() < pushReqQueueSize; } + void recvWLItem(WorkListItem wl); void registerCoalesceEngine(CoalesceEngine* coalesce_engine); From e16c0deadb328f6496d9f424a21cd3677a5ce542 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 22 May 2022 17:49:06 -0700 Subject: [PATCH 105/247] Fixing an incorrect assertion. 
--- configs/accl/sega.py | 23 +++++++++++++++++------ src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 1 - 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8a6ac783c3..11e2cfb6af 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=64, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, @@ -109,7 +109,12 @@ def setEdgePort(self, port, i): self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self, num_mpus, vertex_cache_line_size, graph_path): + def __init__(self, + num_mpus, + vertex_cache_line_size, + graph_path, + first_addr, + first_value): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -122,7 +127,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0, + self.ctrl = CenteralController(addr=first_addr, value=first_value, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports @@ -147,13 +152,19 @@ def get_inputs(): argparser.add_argument("num_mpus", type=int) argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("graph_path", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) args = argparser.parse_args() - return args.num_mpus, args.vertex_cache_line_size, args.graph_path + return args.num_mpus, args.vertex_cache_line_size, \ + args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, graph_path = get_inputs() + num_mpus, vertex_cache_line_size, \ + graph_path, 
first_addr, first_value = get_inputs() + print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, graph_path) + system = SEGA(num_mpus, vertex_cache_line_size, \ + graph_path, first_addr, first_value) root = Root(full_system = False, system = system) m5.instantiate() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 39144972df..dd651f9e5a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -241,7 +241,7 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::respondToMemAlarm() { - assert(pendingMemAlarm() && (!nextEvictEvent.scheduled())); + assert(!nextEvictEvent.scheduled()); schedule(nextEvictEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8cfe3c72cc..ed23fb4d4b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -176,7 +176,6 @@ PushEngine::respondToMemAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); - DPRINTF(MPU, "%s: Responded to an alarm.\n", __func__); } bool From 83af4b3b2720bdb7d0ab3b836c4f0c2516b1a950 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 3 Jun 2022 07:44:25 -0700 Subject: [PATCH 106/247] Converting apply and evict queues to FIFOSet. 
--- src/accl/graph/base/data_structs.hh | 50 +++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 68 +++++++++----------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.hh | 3 +- 4 files changed, 76 insertions(+), 49 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 7535d4bbac..e03686a7e9 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,6 +32,9 @@ #include "base/cprintf.hh" #include "base/intmath.hh" +#include +#include + namespace gem5 { @@ -83,6 +86,53 @@ struct __attribute__ ((packed)) Edge }; static_assert(isPowerOf2(sizeof(WorkListItem))); +static_assert(isPowerOf2(sizeof(Edge))); + +template +class FIFOSet +{ + private: + std::queue fifo; + std::unordered_set set; + + public: + FIFOSet(int cap) + { + set.reserve(cap); + } + + void push_back(T item) + { + if (set.find(item) == set.end()) { + set.insert(item); + fifo.push(item); + } + } + + void pop_front() + { + T front = fifo.front(); + set.erase(front); + fifo.pop(); + } + + T& front() + { + return fifo.front(); + } + + size_t size() { + return fifo.size(); + } + + bool empty() { + return fifo.empty(); + } + + bool find(T item) { + return (set.find(item) != set.end()); + } +}; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dd651f9e5a..f96adbf8d8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,6 +45,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), pendingPushAlarm(false), + applyQueue(numLines), + evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -55,6 +57,7 @@ CoalesceEngine::CoalesceEngine(const 
CoalesceEngineParams ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + peerPushEngine->registerCoalesceEngine(this); } @@ -141,14 +144,18 @@ CoalesceEngine::recvReadAddr(Addr addr) "line[%d]", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; + if (!cacheBlocks[block_index].busyMask) { + applyQueue.push_back(block_index); + assert(!applyQueue.empty()); + if ((!nextApplyEvent.scheduled()) && + (!pendingPushAlarm)) { + schedule(nextApplyEvent, nextCycle()); + } + } return true; } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - //TODO: Fix this to work with new inheritance. - // assert( - // outstandingMemReqQueue.size() <= - // outstandingMemReqQueueSize); DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); @@ -278,8 +285,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - + Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " @@ -333,7 +339,7 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
- Addr aligned_addr = std::floor(addr / peerMemoryAtomSize) * peerMemoryAtomSize; + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); @@ -359,18 +365,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) " It does not have any taken items anymore.\n", __func__, block_index); // TODO: Fix this hack - bool found = false; - for (auto i : applyQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { - applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); - } + applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); } if ((!applyQueue.empty()) && @@ -395,15 +392,9 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " "needed. Adding the cache line to evict schedule.\n", __func__, block_index); - bool found = false; - for (auto i : evictQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { + if (cacheBlocks[block_index].hasConflict) { evictQueue.push_back(block_index); + assert(!evictQueue.empty()); DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } @@ -435,15 +426,7 @@ CoalesceEngine::processNextApplyEvent() } } // TODO: This is where eviction policy goes - // TODO: Fix this hack. - bool found = false; - for (auto i : evictQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { + if (cacheBlocks[block_index].hasConflict){ evictQueue.push_back(block_index); DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); @@ -469,15 +452,8 @@ CoalesceEngine::processNextEvictEvent() { int block_index = evictQueue.front(); - bool found_in_apply_queue = false; - for (auto i : applyQueue) { - if (i == block_index) { - found_in_apply_queue = true; - break; - } - } if ((cacheBlocks[block_index].busyMask) || - (found_in_apply_queue)) { + (applyQueue.find(block_index))) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); @@ -514,8 +490,8 @@ CoalesceEngine::processNextEvictEvent() " is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * - peerMemoryAtomSize; + roundDown(miss_addr, peerMemoryAtomSize); + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 824faef10d..177bb067ab 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -84,9 +84,9 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; bool pendingPushAlarm; - std::deque applyQueue; + FIFOSet applyQueue; - std::deque evictQueue; + FIFOSet evictQueue; EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ae465f6eb1..c93b3b386d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -31,6 +31,7 @@ #include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "base/intmath.hh" #include "params/PushEngine.hh" namespace gem5 @@ -59,7 +60,7 @@ class PushEngine : public BaseMemEngine std::tuple nextReadPacketInfo() { panic_if(done(), "Should not call nextPacketInfo when done.\n"); - Addr 
aligned_addr = std::floor(_start / _atom) * _atom; + Addr aligned_addr = roundDown(_start, _atom); Addr offset = _start - aligned_addr; int num_items = 0; From e9c4b2e982425c29d348780c5d819a8b7893f377 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 13 Jun 2022 14:48:49 -0700 Subject: [PATCH 107/247] Moving delete pkt in push_engine.cc. --- src/accl/graph/sega/push_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ed23fb4d4b..cb71b73c60 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -228,8 +228,8 @@ PushEngine::processNextPushEvent() reqOffsetMap.erase(pkt->req); reqNumEdgeMap.erase(pkt->req); reqValueMap.erase(pkt->req); - delete pkt; memRespQueue.pop_front(); + delete pkt; } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { From a07fba27ea6d0869853fe4db500680e4c62aeb9f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 19 Jun 2022 14:29:57 -0700 Subject: [PATCH 108/247] Enforced limited length on memRespQueue in PushEngine. 
--- configs/accl/sega.py | 15 +++++--- src/accl/graph/SConscript | 3 +- src/accl/graph/base/BaseMemEngine.py | 2 ++ src/accl/graph/base/base_mem_engine.cc | 49 +++++++++++++++++--------- src/accl/graph/base/base_mem_engine.hh | 4 +++ src/accl/graph/sega/coalesce_engine.cc | 5 ++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 5 +++ src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 12 +++++-- 10 files changed, 72 insertions(+), 25 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 11e2cfb6af..a5dd759f1f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,16 +9,21 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=64, - attached_memory_atom_size=64) + push_req_queue_size=1, + attached_memory_atom_size=64, + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=16) + num_mshr_entry=1, + num_tgts_per_mshr=1, + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=32, - on_the_fly_update_map_size=8) + update_queue_size=1, + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 00fa2466dd..9663d3f263 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,4 +27,5 @@ Import('*') -DebugFlag('MPU') \ No newline at end of file +DebugFlag('MPU') +DebugFlag('SEGAQSize') diff --git a/src/accl/graph/base/BaseMemEngine.py b/src/accl/graph/base/BaseMemEngine.py index 69f68e9dfc..2ecb6659d8 100644 --- a/src/accl/graph/base/BaseMemEngine.py +++ b/src/accl/graph/base/BaseMemEngine.py @@ -43,3 +43,5 @@ class 
BaseMemEngine(ClockedObject): attached_memory_atom_size = Param.Int(64, "The atom size of the attached " "memory.") + + resp_queue_size = Param.Int(64, "blah") diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index f02f1d2feb..112b0d63cb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,6 +29,8 @@ #include "accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" +#include "debug/SEGAQSize.hh" + namespace gem5 { @@ -37,6 +39,8 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): system(params.system), memPort(name() + ".mem_port", this), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + onTheFlyReqs(0), + respQueueSize(params.resp_queue_size), memAlarmRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), @@ -73,7 +77,7 @@ bool BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) { //TODO: Investigate sending true all the time - return owner->handleMemResp(pkt); + return owner->recvTimingResp(pkt); } void @@ -98,20 +102,25 @@ BaseMemEngine::processNextMemReqEvent() return; } - // TODO: Maybe add a DPRINTF here. - PacketPtr pkt = outstandingMemReqQueue.front(); - memPort.sendPacket(pkt); - DPRINTF(MPU, "%s: Sent a packet to memory with the following info. " - "pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - outstandingMemReqQueue.pop_front(); - - if (memAlarmRequested && - (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - memSpaceRequested))) { - memAlarmRequested = false; - memSpaceRequested = 0; - respondToMemAlarm(); + if ((respBuffSize() == -1) || + ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { + PacketPtr pkt = outstandingMemReqQueue.front(); + memPort.sendPacket(pkt); + onTheFlyReqs++; + DPRINTF(MPU, "%s: Sent a packet to memory with the following info. 
" + "pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); + outstandingMemReqQueue.pop_front(); + DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", + __func__, outstandingMemReqQueue.size()); + + if (memAlarmRequested && + (outstandingMemReqQueue.size() <= + (outstandingMemReqQueueSize - memSpaceRequested))) { + memAlarmRequested = false; + memSpaceRequested = 0; + respondToMemAlarm(); + } } if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { @@ -171,7 +180,8 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - + DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", + __func__, outstandingMemReqQueue.size()); assert(!outstandingMemReqQueue.empty()); if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); @@ -197,4 +207,11 @@ BaseMemEngine::wakeUp() } } +bool +BaseMemEngine::recvTimingResp(PacketPtr pkt) +{ + onTheFlyReqs--; + return handleMemResp(pkt); +} + } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 8a18807e2e..fc67f3f6d8 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -69,6 +69,8 @@ class BaseMemEngine : public ClockedObject MemPort memPort; int outstandingMemReqQueueSize; + int onTheFlyReqs; + int respQueueSize; bool memAlarmRequested; int memSpaceRequested; std::deque outstandingMemReqQueue; @@ -90,6 +92,7 @@ class BaseMemEngine : public ClockedObject void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void enqueueMemReq(PacketPtr pkt); + virtual int respBuffSize() = 0; virtual void respondToMemAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; @@ -109,6 +112,7 @@ class BaseMemEngine : public ClockedObject AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } + bool recvTimingResp(PacketPtr pkt); void 
recvFunctional(PacketPtr pkt); void wakeUp(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f96adbf8d8..ee1e3f85ff 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -141,11 +141,14 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " - "line[%d]", __func__, addr, block_index); + "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; if (!cacheBlocks[block_index].busyMask) { applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. " + "applyQueue.size = %u.\n", __func__, + block_index, applyQueue.size()); assert(!applyQueue.empty()); if ((!nextApplyEvent.scheduled()) && (!pendingPushAlarm)) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 177bb067ab..1e353c11b8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,6 +118,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; protected: + virtual int respBuffSize() { return -1; } virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cb71b73c60..a045bbdead 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -93,6 +93,11 @@ PushEngine::ReqPort::recvReqRetry() sendPacket(blockedPacket); if (!_blocked) { + DPRINTF(MPU, "%s: Sent the blockedPacket. " + "_blocked: %s, (blockedPacket == nullptr): %s.\n", + __func__, _blocked ? "true" : "false", + (blockedPacket == nullptr) ? 
"true" : "false"); + blockedPacket = nullptr; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c93b3b386d..2c17501d5b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -140,6 +140,7 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: + virtual int respBuffSize() { return memRespQueue.size(); } virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 75ac4f784e..55a9147ac9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -136,6 +136,9 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " + "onTheFlyUpdateMap.size: %lu.\n", + __func__, onTheFlyUpdateMap.size()); if (coalesceEngine->recvReadAddr(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " @@ -147,6 +150,10 @@ WLEngine::processNextReadEvent() __func__, updateQueue.size()); respPort.checkRetryReq(); } + } else { + DPRINTF(MPU, "%s: No entries available in onTheFlyUpdateMap. " + "onTheFlyUpdateMap.size: %lu.\n", __func__, + onTheFlyUpdateMap.size()); } } else { // TODO: Generalize this to reduce function rather than just min @@ -209,8 +216,9 @@ WLEngine::processNextReduceEvent() coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); onTheFlyUpdateMap.erase(addr); - DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, addr); + DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap. 
" + "onTheFlyUpdateMap.size: %lu.\n", + __func__, addr, onTheFlyUpdateMap.size()); } addrWorkListMap.clear(); } From dd056de8c00f33db13d14350910c5de8d6908c19 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 8 Jul 2022 10:36:46 -0700 Subject: [PATCH 109/247] Adding bit vector implementation for caching push meta data. --- configs/accl/sega.py | 7 +- src/accl/graph/base/base_mem_engine.cc | 10 +- src/accl/graph/base/data_structs.hh | 86 +++++++++------- src/accl/graph/sega/CoalesceEngine.py | 3 + src/accl/graph/sega/coalesce_engine.cc | 137 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 14 ++- src/accl/graph/sega/push_engine.cc | 62 ++++++++--- src/accl/graph/sega/push_engine.hh | 12 ++- 8 files changed, 227 insertions(+), 104 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a5dd759f1f..96408aa185 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=1, + push_req_queue_size=0, attached_memory_atom_size=64, outstanding_mem_req_queue_size=1, resp_queue_size=1) @@ -19,8 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=2) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) @@ -77,7 +76,7 @@ def __init__(self, ) edge_mem_ctrl.append( SimpleMemory(range=self._edge_ranges[i], - bandwidth="19.2GB/s", + bandwidth="4.8GB/s", latency="30ns", image_file=f"{graph_path}/edgelist_{i}") ) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 112b0d63cb..3086b81fc2 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,7 +29,6 @@ #include 
"accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" -#include "debug/SEGAQSize.hh" namespace gem5 { @@ -102,8 +101,8 @@ BaseMemEngine::processNextMemReqEvent() return; } - if ((respBuffSize() == -1) || - ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { + if (((respBuffSize() + onTheFlyReqs) < respQueueSize) || + (respQueueSize == 0)) { PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; @@ -111,8 +110,6 @@ BaseMemEngine::processNextMemReqEvent() "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", - __func__, outstandingMemReqQueue.size()); if (memAlarmRequested && (outstandingMemReqQueue.size() <= @@ -180,8 +177,7 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", - __func__, outstandingMemReqQueue.size()); + assert(!outstandingMemReqQueue.empty()); if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index e03686a7e9..e30d6029cb 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,8 +32,9 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include +#include #include +#include namespace gem5 { @@ -91,49 +92,64 @@ static_assert(isPowerOf2(sizeof(Edge))); template class FIFOSet { - private: - std::queue fifo; - std::unordered_set set; - - public: - FIFOSet(int cap) - { - set.reserve(cap); - } + private: + std::queue fifo; + std::unordered_set set; - void push_back(T item) - { - if (set.find(item) == set.end()) { - set.insert(item); - fifo.push(item); - } - } + public: + FIFOSet(int cap) + { + set.reserve(cap); + } - void pop_front() - { - T front = fifo.front(); - 
set.erase(front); - fifo.pop(); + void push_back(T item) + { + if (set.find(item) == set.end()) { + set.insert(item); + fifo.push(item); } + } - T& front() - { - return fifo.front(); - } + void pop_front() + { + T front = fifo.front(); + set.erase(front); + fifo.pop(); + } - size_t size() { - return fifo.size(); - } + T& front() + { + return fifo.front(); + } - bool empty() { - return fifo.empty(); - } + size_t size() { + return fifo.size(); + } - bool find(T item) { - return (set.find(item) != set.end()); - } + bool empty() { + return fifo.empty(); + } + + bool find(T item) { + return (set.find(item) != set.end()); + } }; +// template +// class BitVector +// { +// private: +// int it; +// std::bitset bitStore; + +// public: +// BitVector(): it(0) { bitStore.reset(); } + +// uint32_t next() { + +// } +// }; + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 086f284950..7667a22c5a 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -41,4 +41,7 @@ class CoalesceEngine(BaseMemEngine): num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") + # Don't change. If changed. It will break functionality of coalesce. 
+ resp_queue_size = 0 + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ee1e3f85ff..b5eeae694e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -44,7 +44,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - pendingPushAlarm(false), applyQueue(numLines), evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), @@ -58,7 +57,9 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } - peerPushEngine->registerCoalesceEngine(this); + peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); + + needsApply.reset(); } void @@ -67,6 +68,38 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) sendMemFunctional(pkt); } +void +CoalesceEngine::startup() +{ + AddrRangeList vertex_ranges = getAddrRanges(); + + bool found = false; + Addr first_match_addr = 0; + while(!found) { + for (auto range: vertex_ranges) { + if (range.contains(first_match_addr)) { + found = true; + break; + } + } + first_match_addr += peerMemoryAtomSize; + } + + found = false; + Addr second_match_addr = first_match_addr + peerMemoryAtomSize; + while(!found) { + for (auto range: vertex_ranges) { + if (range.contains(second_match_addr)) { + found = true; + break; + } + } + second_match_addr += peerMemoryAtomSize; + } + + nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); +} + void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) { @@ -150,8 +183,7 @@ CoalesceEngine::recvReadAddr(Addr addr) "applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); assert(!applyQueue.empty()); - if ((!nextApplyEvent.scheduled()) && - (!pendingPushAlarm)) { + if ((!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); 
} } @@ -363,18 +395,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) { + if ((cacheBlocks[block_index].busyMask == 0)) {(aligned_addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); - // TODO: Fix this hack applyQueue.push_back(block_index); DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } if ((!applyQueue.empty()) && - (!pendingPushAlarm) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } @@ -393,14 +423,7 @@ CoalesceEngine::processNextApplyEvent() stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " - "needed. Adding the cache line to evict schedule.\n", - __func__, block_index); - if (cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); - assert(!evictQueue.empty()); - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); - } + "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; @@ -409,31 +432,38 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].items[i].tempProp); if (new_prop != old_prop) { - if (peerPushEngine->allocatePushSpace()) { - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", - __func__, + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", - __func__, - cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); + + Addr block_addr = cacheBlocks[block_index].addr; + int atom_index = (int) (block_addr / (peerMemoryAtomSize * nmpu)); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + int bit_index = atom_index * block_bits + i; + + if (needsApply[bit_index] == 1) { + DPRINTF(MPU, "%s: WorkListItem[%lu] already set in bit-vector." + " Not doing anything further.\n", __func__, + block_addr + (i * sizeof(WorkListItem))); } else { - peerPushEngine->setPushAlarm(); - pendingPushAlarm = true; - return; + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); + } else { + needsApply[bit_index] = 1; + } } } } - // TODO: This is where eviction policy goes - if (cacheBlocks[block_index].hasConflict){ - evictQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); - } + } + + // TODO: This is where eviction policy goes + if (cacheBlocks[block_index].hasConflict){ + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); } applyQueue.pop_front(); @@ -536,9 +566,42 @@ CoalesceEngine::processNextEvictEvent() void CoalesceEngine::respondToPushAlarm() { - assert(pendingPushAlarm && (!nextApplyEvent.scheduled())); - pendingPushAlarm = false; - schedule(nextApplyEvent, nextCycle()); + DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); + int it; + for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + uint32_t slice = 0; + for (int i = 0; i < numElementsPerLine; i++) { + slice <<= 1; + slice |= needsApply[it + i]; + } + if (slice) { + break; + } + } + DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", + __func__, slice, it); + + Addr block_addr = (nmpu * peerMemoryAtomSize) * + ((int)(it / (peerMemoryAtomSize / sizeof(WorkListItem)))); + int block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + + if ((cacheBlocks[block_index].addr == block_addr) && + (cacheBlocks[block_index].valid)) { + // hit in cache + bool do_push = cacheBlocks[block_index].busyMask == 0 ? true : false; + for (int i = 0; i < numElementsPerLine; i++) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], do_push); + } + + // TODO: Should we add block_index to evict_queue? 
+ if (do_push && cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + } + } else { + PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); + + } } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 1e353c11b8..e6c70502af 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,12 +29,16 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#include + #include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" +#define MAX_BITVECTOR_SIZE (1 << 30) + // TODO: Add parameters for size, memory atom size, type size, // length of items in the blocks. namespace gem5 @@ -68,6 +72,7 @@ class CoalesceEngine : public BaseMemEngine items = new WorkListItem [num_elements]; } }; + int nmpu; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -83,8 +88,9 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; - bool pendingPushAlarm; FIFOSet applyQueue; + int needsApplyFirstPointer; + std::bitset needsApply; FIFOSet evictQueue; @@ -127,14 +133,16 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine(const CoalesceEngineParams ¶ms); - void recvFunctional(PacketPtr pkt); - bool recvReadAddr(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); void respondToPushAlarm(); + + void recvFunctional(PacketPtr pkt); + + virtual void startup(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a045bbdead..8bc2d55a28 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" 
+#include "debug/SEGAQSize.hh" #include "mem/packet_access.hh" namespace gem5 @@ -37,9 +38,10 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), - pushAlarmSet(false), + retrySpaceAllocated(0), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), + numRetries(0), pushReqQueueSize(params.push_req_queue_size), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), @@ -59,9 +61,11 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } void -PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine) +PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, + int elements_per_line) { peerCoalesceEngine = coalesce_engine; + numElementsPerLine = elements_per_line; } void @@ -115,15 +119,21 @@ PushEngine::recvWLItem(WorkListItem wl) assert((pushReqQueueSize == 0) || (pushReqQueue.size() < pushReqQueueSize)); - panic_if(pushReqQueue.size() == pushReqQueueSize, "You should call this " - "method after checking if there is enough push space. Use " - "allocatePushSpace.\n"); + panic_if((pushReqQueue.size() == pushReqQueueSize) && + (pushReqQueueSize != 0), "You should call this method after " + "checking if there is enough push space. 
Use allocatePushSpace.\n"); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + + if (curTick() % 50000 == 0) { + DPRINTF(SEGAQSize, "%s: pushReqQueue.size: %lu.\n", + __func__, pushReqQueue.size()); + } assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && @@ -132,6 +142,25 @@ PushEngine::recvWLItem(WorkListItem wl) } } +void +PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) +{ + if (do_push) { + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = wl.prop; + + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + numRetries--; + } + retrySpaceAllocated--; + if ((!nextAddrGenEvent.scheduled()) && + (!memReqQueueFull())) { + schedule(nextAddrGenEvent, nextCycle()); + } +} + void PushEngine::processNextAddrGenEvent() { @@ -158,8 +187,10 @@ PushEngine::processNextAddrGenEvent() DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - if (pushAlarmSet && (pushReqQueue.size() == pushReqQueueSize - 1)) { - pushAlarmSet = false; + if (numRetries > 0) { + retrySpaceAllocated++; + } + if ((retrySpaceAllocated % numElementsPerLine) == 0) { peerCoalesceEngine->respondToPushAlarm(); } } @@ -261,17 +292,20 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } -void -PushEngine::setPushAlarm() -{ - assert(!pushAlarmSet); - pushAlarmSet = true; +bool +PushEngine::allocatePushSpace() { + if ((pushReqQueueSize == 0) || + ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + return true; + } else { + numRetries++; + return false; + } } PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), - ADD_STAT(numUpdates, statistics::units::Count::get(), "Number of sent updates.") { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2c17501d5b..4f388cd7e6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -98,13 +98,15 @@ class PushEngine : public BaseMemEngine virtual void recvReqRetry(); }; - bool pushAlarmSet; + int numElementsPerLine; + int retrySpaceAllocated; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; Addr baseEdgeAddr; + int numRetries; int pushReqQueueSize; std::deque pushReqQueue; @@ -151,12 +153,14 @@ class PushEngine : public BaseMemEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool allocatePushSpace() { return pushReqQueue.size() < pushReqQueueSize; } + bool allocatePushSpace(); + void recvWLItem(WorkListItem wl); - void registerCoalesceEngine(CoalesceEngine* coalesce_engine); + void recvWLItemRetry(WorkListItem wl, bool do_push); - void setPushAlarm(); + void registerCoalesceEngine(CoalesceEngine* coalesce_engine, + int elements_per_line); }; } From 7a351854013b45cfe260990b60dbc160e1aac24a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: 
Sun, 17 Jul 2022 16:12:07 -0700 Subject: [PATCH 110/247] Completing retry between coalesce and push engine. --- configs/accl/sega.py | 4 +- src/accl/graph/SConscript | 1 + src/accl/graph/TODO.md | 7 +- src/accl/graph/base/base_mem_engine.cc | 13 ++- src/accl/graph/base/data_structs.hh | 3 +- src/accl/graph/sega/coalesce_engine.cc | 155 +++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 12 ++ src/accl/graph/sega/push_engine.cc | 11 +- 8 files changed, 157 insertions(+), 49 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 96408aa185..65645b3bb3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=0, + push_req_queue_size=4, attached_memory_atom_size=64, outstanding_mem_req_queue_size=1, resp_queue_size=1) @@ -19,7 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=2) + outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 9663d3f263..36e16affa3 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -29,3 +29,4 @@ Import('*') DebugFlag('MPU') DebugFlag('SEGAQSize') +DebugFlag('MahyarMath') diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index 29b5a2939e..ebfca7e794 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,5 +1,8 @@ # TODO Items -* Replace std::floor with roundDown from intmath.hh in src * We might need to revisit the fact that we could insert something to a queue on - the same cycle that another event is consuming something from the queue. 
\ No newline at end of file + the same cycle that another event is consuming something from the queue. +* Move checking for wl.degree == 0 to coalesce engine. +* Fix the retry system between memory queue and coalesce engine +* Update inheritance: There is not enough reason for PushEngine and +CoalesceEngine to be of the same type (i.e. delete BaseMemEngine). diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 3086b81fc2..64aaa3a737 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -159,17 +159,22 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) bool BaseMemEngine::allocateMemReqSpace(int space) { - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + assert((outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); return ( - outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space) + (outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space)) ); } bool BaseMemEngine::memReqQueueFull() { - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); + assert((outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + return ( + (outstandingMemReqQueueSize != 0) && + (outstandingMemReqQueue.size() == outstandingMemReqQueueSize)); } void diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index e30d6029cb..9c250c6a2f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -93,6 +93,7 @@ template class FIFOSet { private: + // int numInvalids; std::queue fifo; std::unordered_set set; @@ -127,7 +128,7 @@ class FIFOSet } bool empty() { - return fifo.empty(); + return (size() == 0); } bool find(T item) { diff --git 
a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b5eeae694e..1c3f2bcadf 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,6 +31,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" +#include "debug/MahyarMath.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -75,29 +76,39 @@ CoalesceEngine::startup() bool found = false; Addr first_match_addr = 0; - while(!found) { + while(true) { for (auto range: vertex_ranges) { if (range.contains(first_match_addr)) { found = true; break; } } + if (found) { + break; + } first_match_addr += peerMemoryAtomSize; } found = false; Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - while(!found) { + while(true) { for (auto range: vertex_ranges) { if (range.contains(second_match_addr)) { found = true; break; } } + if (found) { + break; + } second_match_addr += peerMemoryAtomSize; } nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); + memoryAddressOffset = first_match_addr; + DPRINTF(MahyarMath, "%s: Initialized address translation information." 
+ " nmpu: %d, memoryAddressOffset: %lu.\n", + __func__, nmpu, memoryAddressOffset); } void @@ -106,6 +117,40 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) peerWLEngine = wl_engine; } +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + return ((int) (addr / peerMemoryAtomSize)) % numLines; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBitIndexBase(Addr addr) +{ + DPRINTF(MahyarMath, "%s: Calculating BitIndexBase for addr %lu.\n", + __func__, addr); + int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + int bit_index = atom_index * block_bits; + DPRINTF(MahyarMath, "%s: BitIndexBase for addr %lu is %d.\n", + __func__, addr, bit_index); + return bit_index; +} + +// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) +Addr +CoalesceEngine::getBlockAddrFromBitIndex(int index) +{ + DPRINTF(MahyarMath, "%s: Calculating BlockAddr for index %d.\n", + __func__, index); + Addr block_addr = (nmpu * peerMemoryAtomSize) * + ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); + DPRINTF(MahyarMath, "%s: BlockAddr for index %d is %lu.\n", + __func__, index, (block_addr + memoryAddressOffset)); + return (block_addr + memoryAddressOffset); +} + bool CoalesceEngine::recvReadAddr(Addr addr) { @@ -298,6 +343,31 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } + if (pkt->findNextSenderState()) { + Addr addr = pkt->getAddr(); + int it = getBitIndexBase(addr); + int block_index = getBlockIndex(addr); + bool found_in_cache = (cacheBlocks[block_index].addr == addr); + + // We have to send the items regardless of them being found in the + // cache. However, if they are found in the cache, two things should + // happen. First, do_push should be set to false and the bit vector + // value for the items should not change. 
To future Mahyar and Marjan, + // If this is confusing, please look at where each item is pushed to + // the apply queue. Hint: Think about updates that might not be sent + // out if you reset the bit regardless of the line being found in the + // cache. + WorkListItem* items = pkt->getPtr(); + for (int i = 0; i < numElementsPerLine; i++) { + needsApply[it + i] = + (needsApply[it + i] == 1) && found_in_cache ? 1 : 0; + + peerPushEngine->recvWLItemRetry(items[i], + ((!found_in_cache) && needsApply[it + i])); + } + return true; + } + Addr addr = pkt->getAddr(); int block_index = (addr / peerMemoryAtomSize) % numLines; @@ -395,11 +465,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) {(aligned_addr / peerMemoryAtomSize) % numLines; + if ((cacheBlocks[block_index].busyMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); + int bit_index = getBitIndexBase(cacheBlocks[block_index].addr); + for (int i = 0; i < numElementsPerLine; i++) { + needsApply[bit_index + i] = 0; + } DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -438,22 +512,15 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - Addr block_addr = cacheBlocks[block_index].addr; - int atom_index = (int) (block_addr / (peerMemoryAtomSize * nmpu)); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - int bit_index = atom_index * block_bits + i; + int bit_index = + getBitIndexBase(cacheBlocks[block_index].addr) + i; - if (needsApply[bit_index] == 1) { - DPRINTF(MPU, "%s: WorkListItem[%lu] already set in bit-vector." 
- " Not doing anything further.\n", __func__, - block_addr + (i * sizeof(WorkListItem))); + assert(needsApply[bit_index] == 0); + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); } else { - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - } else { - needsApply[bit_index] = 1; - } + needsApply[bit_index] = 1; } } } @@ -567,40 +634,56 @@ void CoalesceEngine::respondToPushAlarm() { DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); - int it; + Addr block_addr = 0; + int block_index = 0; + int it = 0; + uint32_t slice = 0; + bool hit_in_cache = false; for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - uint32_t slice = 0; for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsApply[it + i]; } if (slice) { - break; + block_addr = getBlockAddrFromBitIndex(it); + block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + if ((cacheBlocks[block_index].addr == block_addr) && + (cacheBlocks[block_index].valid)) { + if (cacheBlocks[block_index].busyMask == 0) { + hit_in_cache = true; + break; + } + } else { + hit_in_cache = false; + break; + } } } + + assert(it < MAX_BITVECTOR_SIZE); + DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", __func__, slice, it); - Addr block_addr = (nmpu * peerMemoryAtomSize) * - ((int)(it / (peerMemoryAtomSize / sizeof(WorkListItem)))); - int block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; - - if ((cacheBlocks[block_index].addr == block_addr) && - (cacheBlocks[block_index].valid)) { - // hit in cache - bool do_push = cacheBlocks[block_index].busyMask == 0 ? true : false; + if (hit_in_cache) { for (int i = 0; i < numElementsPerLine; i++) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], do_push); - } - - // TODO: Should we add block_index to evict_queue? 
- if (do_push && cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); + peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], + (needsApply[it + i] == 1)); + needsApply[it + i] = 0; } } else { + // FIXME: Fix the retry mechanism between memory and cache to + // handle memory retries correctly. This probably requires scheduling + // an event for sending the retry. For now we're enabling infinite + // queueing in the outstandingMemReqQueue. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); - + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + if (allocateMemReqSpace(1)) { + enqueueMemReq(pkt); + } else { + requestMemAlarm(1); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e6c70502af..973ea479c1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -72,7 +72,15 @@ class CoalesceEngine : public BaseMemEngine items = new WorkListItem [num_elements]; } }; + + struct SenderState : public Packet::SenderState + { + bool isRetry; + SenderState(bool is_retry): isRetry(is_retry) {} + }; + int nmpu; + Addr memoryAddressOffset; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -94,6 +102,10 @@ class CoalesceEngine : public BaseMemEngine FIFOSet evictQueue; + int getBlockIndex(Addr addr); + int getBitIndexBase(Addr addr); + Addr getBlockAddrFromBitIndex(int index); + EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8bc2d55a28..fa611392b4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -149,9 +149,13 @@ PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - - 
pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + if (wl.degree != 0) { + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + } else { + DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", + __func__, wl.to_string()); + } numRetries--; } retrySpaceAllocated--; @@ -164,7 +168,6 @@ PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) void PushEngine::processNextAddrGenEvent() { - Addr aligned_addr, offset; int num_edges; From 2b9604dc53c675f1e4fc943c162e43929ff0af27 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 19 Jul 2022 07:33:16 -0700 Subject: [PATCH 111/247] Updating variable names and debug flags. --- src/accl/graph/SConscript | 3 +- src/accl/graph/base/base_mem_engine.cc | 20 ++--- src/accl/graph/base/base_mem_engine.hh | 12 +-- src/accl/graph/base/data_structs.hh | 33 +++----- src/accl/graph/sega/SConscript | 3 + src/accl/graph/sega/coalesce_engine.cc | 100 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 9 +-- src/accl/graph/sega/push_engine.cc | 53 ++++++------- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 2 +- 10 files changed, 113 insertions(+), 124 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 36e16affa3..7ca60c30bd 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,5 +28,4 @@ Import('*') DebugFlag('MPU') -DebugFlag('SEGAQSize') -DebugFlag('MahyarMath') +# CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine']) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 64aaa3a737..32c314033d 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -40,7 +40,7 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), 
respQueueSize(params.resp_queue_size), - memAlarmRequested(false), + memRetryRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)), @@ -111,12 +111,12 @@ BaseMemEngine::processNextMemReqEvent() __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - if (memAlarmRequested && + if (memRetryRequested && (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - memSpaceRequested))) { - memAlarmRequested = false; + memRetryRequested = false; memSpaceRequested = 0; - respondToMemAlarm(); + recvMemRetry(); } } @@ -157,7 +157,7 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseMemEngine::allocateMemReqSpace(int space) +BaseMemEngine::allocateMemQueueSpace(int space) { assert((outstandingMemReqQueueSize == 0) || (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); @@ -168,7 +168,7 @@ BaseMemEngine::allocateMemReqSpace(int space) } bool -BaseMemEngine::memReqQueueFull() +BaseMemEngine::memQueueFull() { assert((outstandingMemReqQueueSize == 0) || (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); @@ -180,7 +180,7 @@ BaseMemEngine::memReqQueueFull() void BaseMemEngine::enqueueMemReq(PacketPtr pkt) { - panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); + panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); assert(!outstandingMemReqQueue.empty()); @@ -190,12 +190,12 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) } void -BaseMemEngine::requestMemAlarm(int space) { - panic_if((memAlarmRequested == true) || (memSpaceRequested != 0), +BaseMemEngine::requestMemRetry(int space) { + panic_if((memRetryRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); - memAlarmRequested = true; 
+ memRetryRequested = true; memSpaceRequested = space; } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index fc67f3f6d8..64ef49ee1d 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -71,7 +71,7 @@ class BaseMemEngine : public ClockedObject int outstandingMemReqQueueSize; int onTheFlyReqs; int respQueueSize; - bool memAlarmRequested; + bool memRetryRequested; int memSpaceRequested; std::deque outstandingMemReqQueue; @@ -83,17 +83,17 @@ class BaseMemEngine : public ClockedObject size_t peerMemoryAtomSize; - bool allocateMemReqSpace(int space); - bool memReqQueueFull(); + bool allocateMemQueueSpace(int space); + bool memQueueFull(); - bool pendingMemAlarm() { return memAlarmRequested; } - void requestMemAlarm(int space); + bool pendingMemRetry() { return memRetryRequested; } + void requestMemRetry(int space); void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void enqueueMemReq(PacketPtr pkt); virtual int respBuffSize() = 0; - virtual void respondToMemAlarm() = 0; + virtual void recvMemRetry() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 9c250c6a2f..f938be72f1 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -93,8 +93,6 @@ template class FIFOSet { private: - // int numInvalids; - std::queue fifo; std::unordered_set set; public: @@ -107,24 +105,22 @@ class FIFOSet { if (set.find(item) == set.end()) { set.insert(item); - fifo.push(item); } } void pop_front() { - T front = fifo.front(); - set.erase(front); - fifo.pop(); + assert(set.begin() != set.end()); + set.erase(set.begin()); } - T& front() + T front() { - return fifo.front(); + return *(set.begin()); } size_t size() { - return fifo.size(); + return set.size(); } bool empty() { @@ -134,22 +130,11 @@ 
class FIFOSet bool find(T item) { return (set.find(item) != set.end()); } -}; - -// template -// class BitVector -// { -// private: -// int it; -// std::bitset bitStore; - -// public: -// BitVector(): it(0) { bitStore.reset(); } -// uint32_t next() { - -// } -// }; + void erase(T item) { + set.erase(item); + } +}; } diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 16fab86ede..77e508f4ed 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -39,3 +39,6 @@ Source('wl_engine.cc') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') +DebugFlag('CoalesceEngine') +DebugFlag('PushEngine') +DebugFlag('WLEngine') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1c3f2bcadf..66b8e1fad7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,7 +31,6 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/MahyarMath.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -60,7 +59,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); - needsApply.reset(); + needsPush.reset(); } void @@ -106,9 +105,6 @@ CoalesceEngine::startup() nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); memoryAddressOffset = first_match_addr; - DPRINTF(MahyarMath, "%s: Initialized address translation information." 
- " nmpu: %d, memoryAddressOffset: %lu.\n", - __func__, nmpu, memoryAddressOffset); } void @@ -128,13 +124,9 @@ CoalesceEngine::getBlockIndex(Addr addr) int CoalesceEngine::getBitIndexBase(Addr addr) { - DPRINTF(MahyarMath, "%s: Calculating BitIndexBase for addr %lu.\n", - __func__, addr); int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); int bit_index = atom_index * block_bits; - DPRINTF(MahyarMath, "%s: BitIndexBase for addr %lu is %d.\n", - __func__, addr, bit_index); return bit_index; } @@ -142,17 +134,13 @@ CoalesceEngine::getBitIndexBase(Addr addr) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { - DPRINTF(MahyarMath, "%s: Calculating BlockAddr for index %d.\n", - __func__, index); Addr block_addr = (nmpu * peerMemoryAtomSize) * ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); - DPRINTF(MahyarMath, "%s: BlockAddr for index %d is %lu.\n", - __func__, index, (block_addr + memoryAddressOffset)); return (block_addr + memoryAddressOffset); } bool -CoalesceEngine::recvReadAddr(Addr addr) +CoalesceEngine::recvWLRead(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", @@ -239,7 +227,7 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (memReqQueueFull()) { + if (memQueueFull()) { DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. 
" "Rejecting request.\n", __func__); stats.readRejections++; @@ -326,7 +314,7 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::respondToMemAlarm() +CoalesceEngine::recvMemRetry() { assert(!nextEvictEvent.scheduled()); schedule(nextEvictEvent, nextCycle()); @@ -347,8 +335,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int it = getBitIndexBase(addr); int block_index = getBlockIndex(addr); - bool found_in_cache = (cacheBlocks[block_index].addr == addr); + bool line_do_push = false; + if (cacheBlocks[block_index].addr == addr) { + if (cacheBlocks[block_index].busyMask == 0) { + assert(applyQueue.find(block_index)); + line_do_push = true; + } else { + line_do_push = false; + } + } // We have to send the items regardless of them being found in the // cache. However, if they are found in the cache, two things should // happen. First, do_push should be set to false and the bit vector @@ -359,11 +355,19 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // cache. WorkListItem* items = pkt->getPtr(); for (int i = 0; i < numElementsPerLine; i++) { - needsApply[it + i] = - (needsApply[it + i] == 1) && found_in_cache ? 
1 : 0; - + assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; peerPushEngine->recvWLItemRetry(items[i], - ((!found_in_cache) && needsApply[it + i])); + (line_do_push && needsPush[it + i])); + } + + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); } return true; } @@ -470,10 +474,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); - int bit_index = getBitIndexBase(cacheBlocks[block_index].addr); - for (int i = 0; i < numElementsPerLine; i++) { - needsApply[bit_index + i] = 0; - } DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -488,6 +488,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { + if (applyQueue.empty()) { + return; + } + int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { @@ -514,13 +518,13 @@ CoalesceEngine::processNextApplyEvent() int bit_index = getBitIndexBase(cacheBlocks[block_index].addr) + i; - - assert(needsApply[bit_index] == 0); - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - } else { - needsApply[bit_index] = 1; + if (cacheBlocks[block_index].items[i].degree != 0) { + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); + } else { + needsPush[bit_index] = 1; + } } } } @@ -536,7 +540,7 @@ CoalesceEngine::processNextApplyEvent() applyQueue.pop_front(); if ((!evictQueue.empty()) && - (!pendingMemAlarm()) && + (!pendingMemRetry()) && (!nextEvictEvent.scheduled())) { 
schedule(nextEvictEvent, nextCycle()); } @@ -562,13 +566,13 @@ CoalesceEngine::processNextEvictEvent() int space_needed = cacheBlocks[block_index].dirty ? (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!allocateMemReqSpace(space_needed)) { + if (!allocateMemQueueSpace(space_needed)) { DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " "procees the eviction of cache line [%d]. dirty: %d, " "hasConflict: %d.\n", __func__, block_index, cacheBlocks[block_index].dirty, cacheBlocks[block_index].hasConflict); - requestMemAlarm(space_needed); + requestMemRetry(space_needed); return; } else { if (cacheBlocks[block_index].dirty) { @@ -631,7 +635,7 @@ CoalesceEngine::processNextEvictEvent() } void -CoalesceEngine::respondToPushAlarm() +CoalesceEngine::recvPushRetry() { DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); Addr block_addr = 0; @@ -639,14 +643,15 @@ CoalesceEngine::respondToPushAlarm() int it = 0; uint32_t slice = 0; bool hit_in_cache = false; + for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; - slice |= needsApply[it + i]; + slice |= needsPush[it + i]; } if (slice) { block_addr = getBlockAddrFromBitIndex(it); - block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + block_index = getBlockIndex(block_addr); if ((cacheBlocks[block_index].addr == block_addr) && (cacheBlocks[block_index].valid)) { if (cacheBlocks[block_index].busyMask == 0) { @@ -662,14 +667,23 @@ CoalesceEngine::respondToPushAlarm() assert(it < MAX_BITVECTOR_SIZE); - DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", + DPRINTF(MPU, "%s: Found slice %u at %d position in needsPush.\n", __func__, slice, it); if (hit_in_cache) { for (int i = 0; i < numElementsPerLine; i++) { + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + 
cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], - (needsApply[it + i] == 1)); - needsApply[it + i] = 0; + (needsPush[it + i] == 1)); + needsPush[it + i] = 0; + } + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); } } else { // FIXME: Fix the retry mechanism between memory and cache to @@ -679,10 +693,10 @@ CoalesceEngine::respondToPushAlarm() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - if (allocateMemReqSpace(1)) { + if (allocateMemQueueSpace(1)) { enqueueMemReq(pkt); } else { - requestMemAlarm(1); + requestMemRetry(1); } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 973ea479c1..0fa555c84a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -97,8 +97,7 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; FIFOSet applyQueue; - int needsApplyFirstPointer; - std::bitset needsApply; + std::bitset needsPush; FIFOSet evictQueue; @@ -137,7 +136,7 @@ class CoalesceEngine : public BaseMemEngine protected: virtual int respBuffSize() { return -1; } - virtual void respondToMemAlarm(); + virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -145,12 +144,12 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine(const CoalesceEngineParams ¶ms); - bool recvReadAddr(Addr addr); + bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); - void respondToPushAlarm(); + void recvPushRetry(); void recvFunctional(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index fa611392b4..16e0ca6c6c 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,7 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" -#include "debug/SEGAQSize.hh" +#include "debug/PushEngine.hh" #include "mem/packet_access.hh" namespace gem5 @@ -109,13 +109,7 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::recvWLItem(WorkListItem wl) { - // If there are no outdoing edges, no need to generate and push - // updates. Therefore, we only need to return true. - if (wl.degree == 0) { - DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", - __func__, wl.to_string()); - return; - } + assert(wl.degree != 0); assert((pushReqQueueSize == 0) || (pushReqQueue.size() < pushReqQueueSize)); @@ -123,6 +117,7 @@ PushEngine::recvWLItem(WorkListItem wl) (pushReqQueueSize != 0), "You should call this method after " "checking if there is enough push space. Use allocatePushSpace.\n"); + DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; @@ -130,14 +125,9 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - if (curTick() % 50000 == 0) { - DPRINTF(SEGAQSize, "%s: pushReqQueue.size: %lu.\n", - __func__, pushReqQueue.size()); - } - assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && - (!memReqQueueFull())) { + (!memQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } } @@ -145,24 +135,22 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) { + DPRINTF(PushEngine, "%s: Received %s with do_push = %s.\n", + __func__, wl.to_string(), do_push ? 
"true" : "false"); if (do_push) { Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - if (wl.degree != 0) { - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); - } else { - DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", - __func__, wl.to_string()); - } + assert(wl.degree != 0); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); numRetries--; + if ((!nextAddrGenEvent.scheduled()) && + (!memQueueFull())) { + schedule(nextAddrGenEvent, nextCycle()); + } } retrySpaceAllocated--; - if ((!nextAddrGenEvent.scheduled()) && - (!memReqQueueFull())) { - schedule(nextAddrGenEvent, nextCycle()); - } } void @@ -173,7 +161,7 @@ PushEngine::processNextAddrGenEvent() PushPacketInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - DPRINTF(MPU, "%s: Current packet information generated by " + DPRINTF(PushEngine, "%s: Current packet information generated by " "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); @@ -185,22 +173,22 @@ PushEngine::processNextAddrGenEvent() enqueueMemReq(pkt); if (curr_info.done()) { - DPRINTF(MPU, "%s: Current PushPacketInfoGen is done.\n", __func__); + DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); pushReqQueue.pop_front(); - DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. " + DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); if (numRetries > 0) { retrySpaceAllocated++; } if ((retrySpaceAllocated % numElementsPerLine) == 0) { - peerCoalesceEngine->respondToPushAlarm(); + peerCoalesceEngine->recvPushRetry(); } } - if (memReqQueueFull()) { + if (memQueueFull()) { if (!pushReqQueue.empty()) { - requestMemAlarm(1); + requestMemRetry(1); } return; } @@ -211,9 +199,10 @@ PushEngine::processNextAddrGenEvent() } void -PushEngine::respondToMemAlarm() +PushEngine::recvMemRetry() { assert(!nextAddrGenEvent.scheduled()); + DPRINTF(PushEngine, "%s: Responding to a memory alarm.\n", __func__); schedule(nextAddrGenEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4f388cd7e6..11122067d6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -143,7 +143,7 @@ class PushEngine : public BaseMemEngine protected: virtual int respBuffSize() { return memRespQueue.size(); } - virtual void respondToMemAlarm(); + virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 55a9147ac9..27ba5c40c8 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -139,7 +139,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " "onTheFlyUpdateMap.size: %lu.\n", __func__, onTheFlyUpdateMap.size()); - if (coalesceEngine->recvReadAddr(update_addr)) { + if (coalesceEngine->recvWLRead(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, From 86a72bc496be523600caf672cdd24c14ba484603 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 19 Jul 2022 14:33:22 -0700 Subject: [PATCH 112/247] Somewhat fixing the correctness. 
--- src/accl/graph/sega/coalesce_engine.cc | 97 +++++++++++++++++--------- src/accl/graph/sega/push_engine.cc | 3 +- 2 files changed, 65 insertions(+), 35 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 66b8e1fad7..274d85a5b1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -117,6 +117,7 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) int CoalesceEngine::getBlockIndex(Addr addr) { + assert((addr % peerMemoryAtomSize) == 0); return ((int) (addr / peerMemoryAtomSize)) % numLines; } @@ -124,6 +125,7 @@ CoalesceEngine::getBlockIndex(Addr addr) int CoalesceEngine::getBitIndexBase(Addr addr) { + assert((addr % peerMemoryAtomSize) == 0); int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); int bit_index = atom_index * block_bits; @@ -134,6 +136,7 @@ CoalesceEngine::getBitIndexBase(Addr addr) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { + assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); Addr block_addr = (nmpu * peerMemoryAtomSize) * ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); return (block_addr + memoryAddressOffset); @@ -336,39 +339,62 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) int it = getBitIndexBase(addr); int block_index = getBlockIndex(addr); - bool line_do_push = false; - if (cacheBlocks[block_index].addr == addr) { + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + // We read the address to send the wl but it is put in cache before + // the read response arrives. if (cacheBlocks[block_index].busyMask == 0) { - assert(applyQueue.find(block_index)); - line_do_push = true; + // It is not busy anymore, we have to send the wl from cache. 
+ for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (cacheBlocks[block_index].items[i].degree == 0))); + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], needsPush[it + i]); + needsPush[it + i] = 0; + } + // Since we have just applied the line, we can take it out of + // the applyQueue if it's in there. No need to do the same + // thing for evictQueue. + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } + } } else { - line_do_push = false; + // The line is busy. Therefore, we have to disregard the data + // we received from the memory and also tell the push engine to + // deallocate the space it allocated for this retry. However, + // we still have to rememeber that these items need a retry. + // i.e. don't change needsPush, call recvWLItemRetry with + // do_push = false + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (cacheBlocks[block_index].items[i].degree == 0))); + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], false); + } + } + } else { + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. + WorkListItem* items = pkt->getPtr(); + // No applying of the line needed. + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (items[i].degree == 0))); + peerPushEngine->recvWLItemRetry(items[i], needsPush[it + i]); + needsPush[it + i] = 0; } - } - // We have to send the items regardless of them being found in the - // cache. 
However, if they are found in the cache, two things should - // happen. First, do_push should be set to false and the bit vector - // value for the items should not change. To future Mahyar and Marjan, - // If this is confusing, please look at where each item is pushed to - // the apply queue. Hint: Think about updates that might not be sent - // out if you reset the bit regardless of the line being found in the - // cache. - WorkListItem* items = pkt->getPtr(); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); - // TODO: Make this more programmable - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry(items[i], - (line_do_push && needsPush[it + i])); } - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - } + delete pkt; return true; } @@ -488,9 +514,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { - if (applyQueue.empty()) { - return; - } + // if (applyQueue.empty()) { + // return; + // } int block_index = applyQueue.front(); @@ -515,10 +541,12 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - int bit_index = getBitIndexBase(cacheBlocks[block_index].addr) + i; - if (cacheBlocks[block_index].items[i].degree != 0) { + if ((cacheBlocks[block_index].items[i].degree != 0) && + (needsPush[bit_index] == 0)) { + // If the respective bit in the bit vector is set + // there is no need to try and resend it. 
if (peerPushEngine->allocatePushSpace()) { peerPushEngine->recvWLItem( cacheBlocks[block_index].items[i]); @@ -684,6 +712,9 @@ CoalesceEngine::recvPushRetry() } if (applyQueue.find(block_index)) { applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } } } else { // FIXME: Fix the retry mechanism between memory and cache to diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 16e0ca6c6c..044429f8fc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -97,12 +97,11 @@ PushEngine::ReqPort::recvReqRetry() sendPacket(blockedPacket); if (!_blocked) { + blockedPacket = nullptr; DPRINTF(MPU, "%s: Sent the blockedPacket. " "_blocked: %s, (blockedPacket == nullptr): %s.\n", __func__, _blocked ? "true" : "false", (blockedPacket == nullptr) ? "true" : "false"); - - blockedPacket = nullptr; } } From 9f4c1f31be4bf999b1b525e604999d529f33e41b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 01:31:49 -0700 Subject: [PATCH 113/247] Almost fixed retry bugs. 14 wrong vertices in lj. 
--- configs/accl/sega.py | 2 +- src/accl/graph/base/base_mem_engine.cc | 18 ++--- src/accl/graph/sega/coalesce_engine.cc | 95 ++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 5 ++ src/accl/graph/sega/push_engine.cc | 101 +++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 4 +- 6 files changed, 170 insertions(+), 55 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 65645b3bb3..eb209911be 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -174,5 +174,5 @@ def get_inputs(): m5.instantiate() exit_event = m5.simulate() - print("Simulation finished!") + print(f"Exited simulation because {exit_event.getCause()}") exit() diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 32c314033d..e05357950b 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -97,12 +97,8 @@ BaseMemEngine::MemPort::recvReqRetry() void BaseMemEngine::processNextMemReqEvent() { - if (memPort.blocked()) { - return; - } - - if (((respBuffSize() + onTheFlyReqs) < respQueueSize) || - (respQueueSize == 0)) { + if ((respQueueSize == 0) || + ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; @@ -120,7 +116,8 @@ BaseMemEngine::processNextMemReqEvent() } } - if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + if ((!memPort.blocked()) && + (!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -183,8 +180,7 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - assert(!outstandingMemReqQueue.empty()); - if (!nextMemReqEvent.scheduled()) { + if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -202,8 +198,8 @@ 
BaseMemEngine::requestMemRetry(int space) { void BaseMemEngine::wakeUp() { - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { + assert(!nextMemReqEvent.scheduled()); + if (!outstandingMemReqQueue.empty()) { schedule(nextMemReqEvent, nextCycle()); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 274d85a5b1..dde6e46aa9 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,6 +31,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" +#include "debug/CoalesceEngine.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -44,11 +45,14 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + currentBitSliceIndex(0), + numRetriesReceived(0), applyQueue(numLines), evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), + nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -344,6 +348,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // We read the address to send the wl but it is put in cache before // the read response arrives. if (cacheBlocks[block_index].busyMask == 0) { + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was found in the cache as idle.\n", + __func__, addr); + int push_needed = 0; // It is not busy anymore, we have to send the wl from cache. 
for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && @@ -354,10 +362,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[i].tempProp); cacheBlocks[block_index].items[i].tempProp = new_prop; cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], needsPush[it + i]); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); // Since we have just applied the line, we can take it out of // the applyQueue if it's in there. No need to do the same // thing for evictQueue. @@ -366,6 +379,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (applyQueue.empty() && nextApplyEvent.scheduled()) { deschedule(nextApplyEvent); } + if (cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + if ((!nextEvictEvent.scheduled()) && + (!pendingMemRetry())) { + schedule(nextEvictEvent, nextCycle()); + } + } } } else { // The line is busy. Therefore, we have to disregard the data @@ -374,24 +394,31 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // we still have to rememeber that these items need a retry. // i.e. don't change needsPush, call recvWLItemRetry with // do_push = false - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (cacheBlocks[block_index].items[i].degree == 0))); - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], false); - } + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was found in the cache as busy.\n", + __func__, addr); + peerPushEngine->deallocatePushSpace(numElementsPerLine); } } else { // We have read the address to send the wl and it is not in the // cache. Simply send the items to the PushEngine. 
+ DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was not found in the cache.\n", + __func__, addr); WorkListItem* items = pkt->getPtr(); + int push_needed = 0; // No applying of the line needed. for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); - peerPushEngine->recvWLItemRetry(items[i], needsPush[it + i]); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry(items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); } delete pkt; @@ -514,10 +541,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { - // if (applyQueue.empty()) { - // return; - // } - int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { @@ -665,14 +688,23 @@ CoalesceEngine::processNextEvictEvent() void CoalesceEngine::recvPushRetry() { - DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); + numRetriesReceived++; + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextSendRetryEvent() +{ + DPRINTF(MPU, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; int it = 0; uint32_t slice = 0; bool hit_in_cache = false; - for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsPush[it + i]; @@ -691,14 +723,23 @@ CoalesceEngine::recvPushRetry() break; } } + if (it == (MAX_BITVECTOR_SIZE - numElementsPerLine)) { + it = 0; + } } assert(it < MAX_BITVECTOR_SIZE); + if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { + currentBitSliceIndex = 0; + } else { + currentBitSliceIndex = it + numElementsPerLine; + } - DPRINTF(MPU, "%s: Found slice %u at %d position 
in needsPush.\n", - __func__, slice, it); + DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " + "in needsPush.\n", __func__, slice, it); if (hit_in_cache) { + int push_needed = 0; for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -706,15 +747,26 @@ CoalesceEngine::recvPushRetry() cacheBlocks[block_index].items[i].tempProp); cacheBlocks[block_index].items[i].tempProp = new_prop; cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], - (needsPush[it + i] == 1)); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); if (applyQueue.empty() && nextApplyEvent.scheduled()) { deschedule(nextApplyEvent); } + if (cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + if ((!nextEvictEvent.scheduled()) && + (!pendingMemRetry())) { + schedule(nextEvictEvent, nextCycle()); + } + } } } else { // FIXME: Fix the retry mechanism between memory and cache to @@ -730,6 +782,11 @@ CoalesceEngine::recvPushRetry() requestMemRetry(1); } } + + numRetriesReceived--; + if ((numRetriesReceived > 0) && (!nextSendRetryEvent.scheduled())) { + schedule(nextSendRetryEvent, nextCycle()); + } } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0fa555c84a..e1033a4622 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -96,6 +96,8 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + int currentBitSliceIndex; + int numRetriesReceived; FIFOSet applyQueue; std::bitset needsPush; @@ -114,6 
+116,9 @@ class CoalesceEngine : public BaseMemEngine EventFunctionWrapper nextEvictEvent; void processNextEvictEvent(); + EventFunctionWrapper nextSendRetryEvent; + void processNextSendRetryEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 044429f8fc..d493b34c53 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -105,6 +105,35 @@ PushEngine::ReqPort::recvReqRetry() } } +void +PushEngine::deallocatePushSpace(int space) +{ + retrySpaceAllocated -= space; + DPRINTF(PushEngine, "%s: Deallocated %d spaces. numRetries = %d, " + "nextAddrGenEvent.scheduled() = %s, pendingMemRetry() = %s, " + "pushReqQueue.size() = %d, retrySpaceAllocated = %d.\n", + __func__, space, numRetries, + nextAddrGenEvent.scheduled() ? "true" : "false", + pendingMemRetry() ? "true" : "false", + pushReqQueue.size(), retrySpaceAllocated); + /// DISCUSS: Might have to check whether the addrGenEvent is scheduled + // and or the pushReqQueue is empty. If so we might need to + // send retries. + if ((numRetries > 0) && + ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { + assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + int free_space = + pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + if (free_space > numElementsPerLine) { + DPRINTF(PushEngine, "%s: Found %d free spaces. 
" + "retrySpaceAllocated = %d.\n", __func__, free_space, + retrySpaceAllocated); + retrySpaceAllocated += numElementsPerLine; + peerCoalesceEngine->recvPushRetry(); + } + } +} + void PushEngine::recvWLItem(WorkListItem wl) { @@ -124,32 +153,41 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - assert(!pushReqQueue.empty()); - if ((!nextAddrGenEvent.scheduled()) && - (!memQueueFull())) { - schedule(nextAddrGenEvent, nextCycle()); + if ((!nextAddrGenEvent.scheduled())) { + if (memQueueFull()) { + if (!pendingMemRetry()) { + requestMemRetry(1); + } + } else { + schedule(nextAddrGenEvent, nextCycle()); + } } } void -PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) +PushEngine::recvWLItemRetry(WorkListItem wl) { - DPRINTF(PushEngine, "%s: Received %s with do_push = %s.\n", - __func__, wl.to_string(), do_push ? "true" : "false"); - if (do_push) { - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - assert(wl.degree != 0); - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); - numRetries--; - if ((!nextAddrGenEvent.scheduled()) && - (!memQueueFull())) { + assert(wl.degree != 0); + DPRINTF(PushEngine, "%s: Received %s with retry.\n", + __func__, wl.to_string()); + + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = wl.prop; + + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + numRetries--; + retrySpaceAllocated--; + if ((!nextAddrGenEvent.scheduled())) { + if (memQueueFull()) { + if (!pendingMemRetry()) { + requestMemRetry(1); + } + } else { schedule(nextAddrGenEvent, nextCycle()); } } - retrySpaceAllocated--; } void @@ -177,11 +215,27 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: 
Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + // if ((numRetries > 0) && + // ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + // retrySpaceAllocated++; + // DPRINTF(PushEngine, "%s: Allocated 1 space for retry. " + // "retrySpaceAllocated = %d.\n", + // __func__, retrySpaceAllocated); + // if ((retrySpaceAllocated % numElementsPerLine) == 0) { + // peerCoalesceEngine->recvPushRetry(); + // } + // } if (numRetries > 0) { - retrySpaceAllocated++; - } - if ((retrySpaceAllocated % numElementsPerLine) == 0) { - peerCoalesceEngine->recvPushRetry(); + int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + DPRINTF(PushEngine, "%s: Found %d free spaces in " + "the pushReqQueue.\n", __func__, free_space); + if (free_space > numElementsPerLine) { + retrySpaceAllocated += numElementsPerLine; + DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " + "retrySpaceAllocated = %d.\n", __func__, free_space, + retrySpaceAllocated); + peerCoalesceEngine->recvPushRetry(); + } } } @@ -201,7 +255,7 @@ void PushEngine::recvMemRetry() { assert(!nextAddrGenEvent.scheduled()); - DPRINTF(PushEngine, "%s: Responding to a memory alarm.\n", __func__); + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); schedule(nextAddrGenEvent, nextCycle()); } @@ -285,6 +339,7 @@ PushEngine::createUpdatePacket(Addr addr, T value) bool PushEngine::allocatePushSpace() { + assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { return true; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 11122067d6..9025ae9946 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -155,9 +155,11 @@ class PushEngine : public BaseMemEngine bool allocatePushSpace(); + void deallocatePushSpace(int space); + void recvWLItem(WorkListItem wl); - void 
recvWLItemRetry(WorkListItem wl, bool do_push); + void recvWLItemRetry(WorkListItem wl); void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); From e54f3c1c05a637cea9d8385253edd25fdd7e0b78 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 11:36:14 -0700 Subject: [PATCH 114/247] Deleting comments and updating config. --- configs/accl/sega.py | 14 +++++++------- src/accl/graph/sega/push_engine.cc | 14 ++------------ 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index eb209911be..15431088d2 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=4, + push_req_queue_size=32, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=64, + resp_queue_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=1, - num_tgts_per_mshr=1, + num_mshr_entry=32, + num_tgts_per_mshr=4, outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=64, + on_the_fly_update_map_size=16) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d493b34c53..e87f4d275e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -215,16 +215,6 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - // if ((numRetries > 0) && - // ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { - // retrySpaceAllocated++; - // DPRINTF(PushEngine, "%s: Allocated 1 space for retry. " - // "retrySpaceAllocated = %d.\n", - // __func__, retrySpaceAllocated); - // if ((retrySpaceAllocated % numElementsPerLine) == 0) { - // peerCoalesceEngine->recvPushRetry(); - // } - // } if (numRetries > 0) { int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); DPRINTF(PushEngine, "%s: Found %d free spaces in " @@ -232,8 +222,8 @@ PushEngine::processNextAddrGenEvent() if (free_space > numElementsPerLine) { retrySpaceAllocated += numElementsPerLine; DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - "retrySpaceAllocated = %d.\n", __func__, free_space, - retrySpaceAllocated); + "retrySpaceAllocated = %d.\n", __func__, + numElementsPerLine, retrySpaceAllocated); peerCoalesceEngine->recvPushRetry(); } } From 5a27472b412574e5f3d02f2be34af319c9e70296 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 14:12:33 -0700 Subject: [PATCH 115/247] Adding a new debug print. --- src/accl/graph/sega/coalesce_engine.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dde6e46aa9..e7e528aaf5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -353,6 +353,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, addr); int push_needed = 0; // It is not busy anymore, we have to send the wl from cache. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (cacheBlocks[block_index].items[i].degree == 0))); @@ -369,6 +371,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); // Since we have just applied the line, we can take it out of @@ -397,7 +401,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(CoalesceEngine, "%s: Received read response for retry " "for addr %lu. It was found in the cache as busy.\n", __func__, addr); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); } } else { // We have read the address to send the wl and it is not in the @@ -408,6 +416,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem* items = pkt->getPtr(); int push_needed = 0; // No applying of the line needed. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); @@ -417,6 +427,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); } @@ -740,6 +752,8 @@ CoalesceEngine::processNextSendRetryEvent() if (hit_in_cache) { int push_needed = 0; + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -754,6 +768,8 @@ CoalesceEngine::processNextSendRetryEvent() push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); From 590c8a8870a475383faf26890c014a85bd9068ec Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 15:51:58 -0700 Subject: [PATCH 116/247] Updating debug flags. Adding one per comp. 
--- configs/accl/sega.py | 14 ++-- src/accl/graph/SConscript | 4 +- src/accl/graph/base/SConscript | 1 + src/accl/graph/base/base_mem_engine.cc | 6 +- src/accl/graph/sega/coalesce_engine.cc | 91 +++++++++++++------------- src/accl/graph/sega/push_engine.cc | 9 ++- src/accl/graph/sega/wl_engine.cc | 44 ++++++------- 7 files changed, 82 insertions(+), 87 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 15431088d2..eb209911be 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=32, + push_req_queue_size=4, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=64, - resp_queue_size=64) + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=32, - num_tgts_per_mshr=4, + num_mshr_entry=1, + num_tgts_per_mshr=1, outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=64, - on_the_fly_update_map_size=16) + update_queue_size=1, + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 7ca60c30bd..f5f7e962af 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,5 +27,5 @@ Import('*') -DebugFlag('MPU') -# CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine']) + +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 4c90dfa9a6..45877a12ca 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -32,3 +32,4 @@ SimObject('BaseReduceEngine.py') Source('base_mem_engine.cc') 
Source('base_reduce_engine.cc') +DebugFlag('BaseMemEngine') diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index e05357950b..cb4c1d81bb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -28,7 +28,7 @@ #include "accl/graph/base/base_mem_engine.hh" -#include "debug/MPU.hh" +#include "debug/BaseMemEngine.hh" namespace gem5 { @@ -102,7 +102,7 @@ BaseMemEngine::processNextMemReqEvent() PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; - DPRINTF(MPU, "%s: Sent a packet to memory with the following info. " + DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following info. " "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); @@ -190,7 +190,7 @@ BaseMemEngine::requestMemRetry(int space) { panic_if((memRetryRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); - DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); + DPRINTF(BaseMemEngine, "%s: Alarm requested with space = %d.\n", __func__, space); memRetryRequested = true; memSpaceRequested = space; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e7e528aaf5..522feebace 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,7 +32,6 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" -#include "debug/MPU.hh" #include "mem/packet_access.hh" namespace gem5 @@ -150,7 +149,7 @@ bool CoalesceEngine::recvWLRead(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); - DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * 
peerMemoryAtomSize; assert(aligned_addr % peerMemoryAtomSize == 0); @@ -167,7 +166,7 @@ CoalesceEngine::recvWLRead(Addr addr) // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string(), @@ -184,28 +183,28 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - DPRINTF(MPU, "%s: Addr: %lu is a miss.\n", __func__, addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHRMap.find(block_index) == MSHRMap.end()) { - DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu not " + DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu not " "found in MSHRs.\n", __func__, block_index, addr); assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries - DPRINTF(MPU, "%s: Out of MSHR entries. " + DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection stats.readRejections++; return false; } else { - DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); + DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); - DPRINTF(MPU, "%s: Addr: %lu has a conflict " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. 
" "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -213,13 +212,13 @@ CoalesceEngine::recvWLRead(Addr addr) } cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; if (!cacheBlocks[block_index].busyMask) { applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. " + DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " "applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); assert(!applyQueue.empty()); @@ -231,11 +230,11 @@ CoalesceEngine::recvWLRead(Addr addr) } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); if (memQueueFull()) { - DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. " + DPRINTF(CoalesceEngine, "%s: No space in outstandingMemReqQueue. " "Rejecting request.\n", __func__); stats.readRejections++; return false; @@ -245,19 +244,19 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(MPU, "%s: Allocated cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Allocated cache line[%d] for " "Addr: %lu.\n", __func__, block_index, addr); MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." " req addr (aligned_addr) = %lu, size = %d.\n", __func__, addr, aligned_addr, peerMemoryAtomSize); enqueueMemReq(pkt); - DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", + DPRINTF(CoalesceEngine, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); stats.readMisses++; stats.numVertexReads++; @@ -265,10 +264,10 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu already " + DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu already " "in MSHRs.\n", __func__, block_index, addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -276,7 +275,7 @@ CoalesceEngine::recvWLRead(Addr addr) } if ((!cacheBlocks[block_index].hasConflict) && (aligned_addr != cacheBlocks[block_index].addr)) { - DPRINTF(MPU, "%s: Addr: %lu has a conflict " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; @@ -289,7 +288,7 @@ CoalesceEngine::recvWLRead(Addr addr) } MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.numVertexReads++; return true; @@ -306,11 +305,11 @@ CoalesceEngine::processNextRespondEvent() std::tie(addr_response, worklist_response) = responseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); - DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", + DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); 
responseQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a response from responseQueue. " + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " "responseQueue.size = %d.\n", __func__, responseQueue.size()); @@ -333,7 +332,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(pkt->isResponse()); if (pkt->isWrite()) { delete pkt; - DPRINTF(MPU, "%s: Received a write response for Addr: %lu. Dropping " + DPRINTF(CoalesceEngine, "%s: Received a write response for Addr: %lu. Dropping " "the packet.\n", __func__, pkt->getAddr()); return true; } @@ -440,7 +439,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = (addr / peerMemoryAtomSize) % numLines; - DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false @@ -449,7 +448,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; @@ -462,13 +461,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - 
DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " + DPRINTF(CoalesceEngine, "%s: Pushed cache line[%d][%d] to " "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); @@ -477,7 +476,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // End of the said block servicedIndices.push_back(i); - DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); } } @@ -490,7 +489,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; - DPRINTF(MPU, "%s: Addr: %lu has been serviced and is removed.\n", + DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", __func__, print_addr); } @@ -517,7 +516,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); @@ -529,17 +528,17 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: Wrote to cache line[%d][%d] = %s.\n", + DPRINTF(CoalesceEngine, "%s: Wrote to cache line[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." 
+ DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -556,12 +555,12 @@ CoalesceEngine::processNextApplyEvent() int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { - DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has no change. Therefore, no apply " "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { @@ -596,7 +595,7 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ evictQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + DPRINTF(CoalesceEngine, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } @@ -621,7 +620,7 @@ CoalesceEngine::processNextEvictEvent() if ((cacheBlocks[block_index].busyMask) || (applyQueue.find(block_index))) { - DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseEvictSchedules++; @@ -630,7 +629,7 @@ CoalesceEngine::processNextEvictEvent() (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 
1 : 0); if (!allocateMemQueueSpace(space_needed)) { - DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " + DPRINTF(CoalesceEngine, "%s: There is not enough space in memReqQueue to " "procees the eviction of cache line [%d]. dirty: %d, " "hasConflict: %d.\n", __func__, block_index, cacheBlocks[block_index].dirty, @@ -639,12 +638,12 @@ CoalesceEngine::processNextEvictEvent() return; } else { if (cacheBlocks[block_index].dirty) { - DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", + DPRINTF(CoalesceEngine, "%s: Change observed on cache line [%d].\n", __func__, block_index); PacketPtr write_pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, " + DPRINTF(CoalesceEngine, "%s: Created a write packet to Addr: %lu, " "size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); enqueueMemReq(write_pkt); @@ -653,7 +652,7 @@ CoalesceEngine::processNextEvictEvent() if (cacheBlocks[block_index].hasConflict) { assert(!MSHRMap[block_index].empty()); Addr miss_addr = MSHRMap[block_index].front(); - DPRINTF(MPU, "%s: First conflicting address for cache line[%d]" + DPRINTF(CoalesceEngine, "%s: First conflicting address for cache line[%d]" " is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = @@ -661,7 +660,7 @@ CoalesceEngine::processNextEvictEvent() PacketPtr read_pkt = createReadPacket(aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = %d.\n", __func__, miss_addr, read_pkt->getAddr(), read_pkt->getSize()); @@ -673,7 +672,7 @@ CoalesceEngine::processNextEvictEvent() cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; cacheBlocks[block_index].dirty = false; - DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Allocated cache line [%d] for Addr: %lu.\n", __func__, block_index, aligned_miss_addr); } else { @@ -683,7 +682,7 @@ CoalesceEngine::processNextEvictEvent() cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; cacheBlocks[block_index].dirty = false; - DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", + DPRINTF(CoalesceEngine, "%s: Deallocated cache line [%d].\n", __func__, block_index); } } @@ -709,7 +708,7 @@ CoalesceEngine::recvPushRetry() void CoalesceEngine::processNextSendRetryEvent() { - DPRINTF(MPU, "%s: Received a push retry.\n", __func__); + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; int it = 0; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e87f4d275e..f17619942b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,7 +29,6 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" -#include "debug/MPU.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" @@ -91,14 +90,14 @@ PushEngine::ReqPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - DPRINTF(MPU, "%s: Received a reqRetry.\n", __func__); + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); _blocked = false; sendPacket(blockedPacket); if (!_blocked) { blockedPacket = nullptr; - DPRINTF(MPU, "%s: Sent the blockedPacket. " + DPRINTF(PushEngine, "%s: Sent the blockedPacket. 
" "_blocked: %s, (blockedPacket == nullptr): %s.\n", __func__, _blocked ? "true" : "false", (blockedPacket == nullptr) ? "true" : "false"); @@ -273,7 +272,7 @@ PushEngine::processNextPushEvent() assert(offset < peerMemoryAtomSize); uint32_t value = reqValueMap[pkt->req]; - DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " + DPRINTF(PushEngine, "%s: Looking at the front of the queue. pkt->Addr: %lu, " "offset: %lu\n", __func__, pkt->getAddr(), offset); @@ -287,7 +286,7 @@ PushEngine::processNextPushEvent() if (!reqPort.blocked()) { reqPort.sendPacket(update); stats.numUpdates++; - DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", + DPRINTF(PushEngine, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 27ba5c40c8..9d4fb9cbe9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,7 +28,7 @@ #include "accl/graph/sega/wl_engine.hh" -#include "debug/MPU.hh" +#include "debug/WLEngine.hh" #include "mem/packet_access.hh" namespace gem5 @@ -73,7 +73,7 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - DPRINTF(MPU, "%s: Sending a RetryReq.\n", __func__); + DPRINTF(WLEngine, "%s: Sending a RetryReq.\n", __func__); sendRetryReq(); needSendRetryReq = false; } @@ -129,45 +129,38 @@ WLEngine::processNextReadEvent() uint32_t update_value; std::tie(update_addr, update_value) = updateQueue.front(); - DPRINTF(MPU, "%s: Looking at the front of the updateQueue. Addr: %lu, " + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " "value: %u.\n", __func__, update_addr, update_value); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { - DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", + DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { - DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " - "onTheFlyUpdateMap.size: %lu.\n", - __func__, onTheFlyUpdateMap.size()); - if (coalesceEngine->recvWLRead(update_addr)) { + if (coalesceEngine->recvReadAddr(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " + DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); respPort.checkRetryReq(); } - } else { - DPRINTF(MPU, "%s: No entries available in onTheFlyUpdateMap. " - "onTheFlyUpdateMap.size: %lu.\n", __func__, - onTheFlyUpdateMap.size()); } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " + DPRINTF(WLEngine, "%s: Found the addr: %lu in onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); - DPRINTF(MPU, "%s: Reduced the update_value with the entry in " + DPRINTF(WLEngine, "%s: Reduced the update_value with the entry in " "onTheFlyUpdateMap. 
onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); respPort.checkRetryReq(); @@ -185,7 +178,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); addrWorkListMap[addr] = wl; - DPRINTF(MPU, "%s: Received a WorkListItem from the coalesceEngine. Adding" + DPRINTF(WLEngine, "%s: Received a WorkListItem from the coalesceEngine. Adding" " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", __func__, addr, wl.to_string()); @@ -202,7 +195,7 @@ WLEngine::processNextReduceEvent() Addr addr = it.first; assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " + DPRINTF(WLEngine, "%s: Reducing between onTheFlyUpdateMap and " "addrWorkListMap values. onTheFlyUpdateMap[%lu] = %u, " "addrWorkListMap[%lu] = %s.\n", __func__, addr, onTheFlyUpdateMap[addr], @@ -210,15 +203,14 @@ WLEngine::processNextReduceEvent() // TODO: Generalize this to reduce function rather than just min addrWorkListMap[addr].tempProp = std::min(update_value, addrWorkListMap[addr].tempProp); - DPRINTF(MPU, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", + DPRINTF(WLEngine, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", __func__, addr, addrWorkListMap[addr].to_string()); stats.numReduce++; coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); onTheFlyUpdateMap.erase(addr); - DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap. 
" - "onTheFlyUpdateMap.size: %lu.\n", - __func__, addr, onTheFlyUpdateMap.size()); + DPRINTF(WLEngine, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", + __func__, addr); } addrWorkListMap.clear(); } @@ -231,8 +223,12 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } + if (curTick() == ) { + std + } + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(MPU, "%s: Pushed an item to the back of updateQueue" + DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); delete pkt; From be1246d97085c07ab86fc888111b9cdb8b6b30ea Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 16:11:12 -0700 Subject: [PATCH 117/247] Removing accidentally commented out wrong code. --- src/accl/graph/sega/wl_engine.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9d4fb9cbe9..70a921c48a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -223,10 +223,6 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - if (curTick() == ) { - std - } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" ". updateQueue.size = %u.\n", From c9458f184ad39f8f147bb18a9f3e29f2ecb90ec1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 21 Jul 2022 14:23:35 -0700 Subject: [PATCH 118/247] Adding in between counter for retry. 
--- src/accl/graph/sega/push_engine.cc | 59 +++++++++++++++++++++--------- src/accl/graph/sega/push_engine.hh | 5 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index f17619942b..0c2b3deb3f 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -37,11 +37,10 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), - retrySpaceAllocated(0), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), - numRetries(0), pushReqQueueSize(params.push_req_queue_size), + numRetries(0), retrySpaceAllocated(0), spacesAllocatedBetweenRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), stats(*this) @@ -118,16 +117,28 @@ PushEngine::deallocatePushSpace(int space) /// DISCUSS: Might have to check whether the addrGenEvent is scheduled // and or the pushReqQueue is empty. If so we might need to // send retries. - if ((numRetries > 0) && - ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { - assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + // if ((numRetries > 0) && + // ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { + // assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + // int free_space = + // pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + // if (free_space > numElementsPerLine) { + // DPRINTF(PushEngine, "%s: Found %d free spaces. " + // "retrySpaceAllocated = %d.\n", __func__, free_space, + // retrySpaceAllocated); + // retrySpaceAllocated += numElementsPerLine; + // peerCoalesceEngine->recvPushRetry(); + // } + // } + + if (numRetries > 0) { int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - if (free_space > numElementsPerLine) { - DPRINTF(PushEngine, "%s: Found %d free spaces. 
" - "retrySpaceAllocated = %d.\n", __func__, free_space, - retrySpaceAllocated); - retrySpaceAllocated += numElementsPerLine; + assert(free_space <= numElementsPerLine); + retrySpaceAllocated += free_space; + spacesAllocatedBetweenRetries += free_space; + if (spacesAllocatedBetweenRetries >= numElementsPerLine) { + spacesAllocatedBetweenRetries %= numElementsPerLine; peerCoalesceEngine->recvPushRetry(); } } @@ -214,15 +225,26 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + // if (numRetries > 0) { + // int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + // DPRINTF(PushEngine, "%s: Found %d free spaces in " + // "the pushReqQueue.\n", __func__, free_space); + // if (free_space > numElementsPerLine) { + // retrySpaceAllocated += numElementsPerLine; + // DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " + // "retrySpaceAllocated = %d.\n", __func__, + // numElementsPerLine, retrySpaceAllocated); + // peerCoalesceEngine->recvPushRetry(); + // } + // } + if (numRetries > 0) { - int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - DPRINTF(PushEngine, "%s: Found %d free spaces in " - "the pushReqQueue.\n", __func__, free_space); - if (free_space > numElementsPerLine) { - retrySpaceAllocated += numElementsPerLine; - DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - "retrySpaceAllocated = %d.\n", __func__, - numElementsPerLine, retrySpaceAllocated); + retrySpaceAllocated++; + DPRINTF(PushEngine, "%s: Allocated one space for retry. 
" + "retrySpaceAllocated = %d.\n", __func__, retrySpaceAllocated); + spacesAllocatedBetweenRetries++; + if (spacesAllocatedBetweenRetries == numElementsPerLine) { + spacesAllocatedBetweenRetries = 0; peerCoalesceEngine->recvPushRetry(); } } @@ -331,6 +353,7 @@ PushEngine::allocatePushSpace() { assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + assert(numRetries == 0); return true; } else { numRetries++; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9025ae9946..cd79139bbc 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -99,15 +99,16 @@ class PushEngine : public BaseMemEngine }; int numElementsPerLine; - int retrySpaceAllocated; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; Addr baseEdgeAddr; - int numRetries; int pushReqQueueSize; + int numRetries; + int retrySpaceAllocated; + int spacesAllocatedBetweenRetries; std::deque pushReqQueue; // TODO: Add size one size for all these maps diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 70a921c48a..79bf046ba3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -136,7 +136,7 @@ WLEngine::processNextReadEvent() DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { - if (coalesceEngine->recvReadAddr(update_addr)) { + if (coalesceEngine->recvWLRead(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, From cb3169882f5dd404f87f533f104d1fa346da30f1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 21 Jul 2022 23:24:32 -0700 Subject: [PATCH 119/247] Fixing the retry mechanism. 
--- src/accl/graph/sega/coalesce_engine.cc | 21 ++++-- src/accl/graph/sega/push_engine.cc | 89 +++++++++----------------- src/accl/graph/sega/push_engine.hh | 9 ++- 3 files changed, 55 insertions(+), 64 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 522feebace..b3167a0e95 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -354,6 +354,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // It is not busy anymore, we have to send the wl from cache. DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (cacheBlocks[block_index].items[i].degree == 0))); @@ -374,6 +375,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); // Since we have just applied the line, we can take it out of // the applyQueue if it's in there. No need to do the same // thing for evictQueue. @@ -402,7 +404,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, addr); DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine); + assert(peerPushEngine->getNumRetries() == needsPush.count()); DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); } @@ -417,6 +421,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // No applying of the line needed. 
DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); @@ -430,6 +435,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); } delete pkt; @@ -708,6 +714,13 @@ CoalesceEngine::recvPushRetry() void CoalesceEngine::processNextSendRetryEvent() { + if (needsPush.count() == 0) { + DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " + "bit in needsPush. Rejecting the retry.\n", __func__); + peerPushEngine->recvRetryReject(); + return; + } + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; @@ -715,7 +728,8 @@ CoalesceEngine::processNextSendRetryEvent() uint32_t slice = 0; bool hit_in_cache = false; - for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; + it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsPush[it + i]; @@ -734,9 +748,6 @@ CoalesceEngine::processNextSendRetryEvent() break; } } - if (it == (MAX_BITVECTOR_SIZE - numElementsPerLine)) { - it = 0; - } } assert(it < MAX_BITVECTOR_SIZE); @@ -753,6 +764,7 @@ CoalesceEngine::processNextSendRetryEvent() int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -770,6 +782,7 @@ CoalesceEngine::processNextSendRetryEvent() DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, 
needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); if (applyQueue.empty() && nextApplyEvent.scheduled()) { diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0c2b3deb3f..6db91734fe 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -40,7 +40,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), - numRetries(0), retrySpaceAllocated(0), spacesAllocatedBetweenRetries(0), + numTotalRetries(0), numPendingRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), stats(*this) @@ -106,39 +106,22 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::deallocatePushSpace(int space) { - retrySpaceAllocated -= space; - DPRINTF(PushEngine, "%s: Deallocated %d spaces. numRetries = %d, " - "nextAddrGenEvent.scheduled() = %s, pendingMemRetry() = %s, " - "pushReqQueue.size() = %d, retrySpaceAllocated = %d.\n", - __func__, space, numRetries, - nextAddrGenEvent.scheduled() ? "true" : "false", - pendingMemRetry() ? "true" : "false", - pushReqQueue.size(), retrySpaceAllocated); /// DISCUSS: Might have to check whether the addrGenEvent is scheduled // and or the pushReqQueue is empty. If so we might need to // send retries. - // if ((numRetries > 0) && - // ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { - // assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); - // int free_space = - // pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - // if (free_space > numElementsPerLine) { - // DPRINTF(PushEngine, "%s: Found %d free spaces. 
" - // "retrySpaceAllocated = %d.\n", __func__, free_space, - // retrySpaceAllocated); - // retrySpaceAllocated += numElementsPerLine; - // peerCoalesceEngine->recvPushRetry(); - // } - // } - - if (numRetries > 0) { - int free_space = - pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - assert(free_space <= numElementsPerLine); - retrySpaceAllocated += free_space; - spacesAllocatedBetweenRetries += free_space; - if (spacesAllocatedBetweenRetries >= numElementsPerLine) { - spacesAllocatedBetweenRetries %= numElementsPerLine; + DPRINTF(PushEngine, "%s: Received reported %d free spaces.\n", + __func__, space); + numPendingRetries--; + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " + "free spaces.\n", __func__, free_space); + if ((free_space > numElementsPerLine) && + (numTotalRetries >= numPendingRetries)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + numPendingRetries++; peerCoalesceEngine->recvPushRetry(); } } @@ -162,6 +145,8 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", + __func__, pushReqQueue.size()); if ((!nextAddrGenEvent.scheduled())) { if (memQueueFull()) { @@ -187,8 +172,10 @@ PushEngine::recvWLItemRetry(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - numRetries--; - retrySpaceAllocated--; + DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", + __func__, pushReqQueue.size()); + + numTotalRetries--; if ((!nextAddrGenEvent.scheduled())) { if (memQueueFull()) { if (!pendingMemRetry()) { @@ -225,26 +212,16 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - // if (numRetries > 0) { - // int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - // DPRINTF(PushEngine, "%s: Found %d free spaces in " - // "the pushReqQueue.\n", __func__, free_space); - // if (free_space > numElementsPerLine) { - // retrySpaceAllocated += numElementsPerLine; - // DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - // "retrySpaceAllocated = %d.\n", __func__, - // numElementsPerLine, retrySpaceAllocated); - // peerCoalesceEngine->recvPushRetry(); - // } - // } - - if (numRetries > 0) { - retrySpaceAllocated++; - DPRINTF(PushEngine, "%s: Allocated one space for retry. " - "retrySpaceAllocated = %d.\n", __func__, retrySpaceAllocated); - spacesAllocatedBetweenRetries++; - if (spacesAllocatedBetweenRetries == numElementsPerLine) { - spacesAllocatedBetweenRetries = 0; + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" + "free spaces.\n", __func__, free_space); + if ((free_space > numElementsPerLine) && + (numTotalRetries >= numPendingRetries)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + numPendingRetries++; peerCoalesceEngine->recvPushRetry(); } } @@ -350,13 +327,11 @@ PushEngine::createUpdatePacket(Addr addr, T value) bool PushEngine::allocatePushSpace() { - assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || - ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { - assert(numRetries == 0); + ((pushReqQueue.size() < pushReqQueueSize) && (numTotalRetries == 0))) { return true; } else { - numRetries++; + numTotalRetries++; return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index cd79139bbc..a3a308554f 100644 --- a/src/accl/graph/sega/push_engine.hh +++ 
b/src/accl/graph/sega/push_engine.hh @@ -106,9 +106,8 @@ class PushEngine : public BaseMemEngine Addr baseEdgeAddr; int pushReqQueueSize; - int numRetries; - int retrySpaceAllocated; - int spacesAllocatedBetweenRetries; + int numTotalRetries; + int numPendingRetries; std::deque pushReqQueue; // TODO: Add size one size for all these maps @@ -164,6 +163,10 @@ class PushEngine : public BaseMemEngine void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); + + int getNumRetries() { return numTotalRetries; } + + void recvRetryReject() { numPendingRetries--; } }; } From c03a23a38717d7dd123bb92b0a55bb048e53545f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 15:59:31 -0700 Subject: [PATCH 120/247] Limiting retries to one. --- src/accl/graph/sega/push_engine.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 6db91734fe..ab2962b253 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -118,7 +118,7 @@ PushEngine::deallocatePushSpace(int space) DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " "free spaces.\n", __func__, free_space); if ((free_space > numElementsPerLine) && - (numTotalRetries >= numPendingRetries)) { + (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); numPendingRetries++; @@ -218,7 +218,7 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" "free spaces.\n", __func__, free_space); if ((free_space > numElementsPerLine) && - (numTotalRetries >= numPendingRetries)) { + (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); numPendingRetries++; From dcfaab330d517c1b02c8aaa882336698d1a29de6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 17:28:51 -0700 Subject: [PATCH 121/247] Adding MemoryEvent 
class and nextReadOnMissEvent. --- src/accl/graph/sega/coalesce_engine.cc | 42 +++++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 21 +++++++++++-- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b3167a0e95..033c1f3363 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,6 +48,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numRetriesReceived(0), applyQueue(numLines), evictQueue(numLines), + nextReadOnMissEvent([this] { processNextReadOnMissEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -175,7 +176,6 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; - assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } @@ -233,9 +233,9 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (memQueueFull()) { - DPRINTF(CoalesceEngine, "%s: No space in outstandingMemReqQueue. " - "Rejecting request.\n", __func__); + if (lineFillBuffer.size() == numMSHREntry) { + DPRINTF(CoalesceEngine, "%s: No space left in " + "lineFillBuffer. Rejecting request.\n", __func__); stats.readRejections++; return false; } @@ -255,9 +255,15 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = %d.\n", __func__, addr, aligned_addr, peerMemoryAtomSize); - enqueueMemReq(pkt); - DPRINTF(CoalesceEngine, "%s: Pushed pkt to outstandingMemReqQueue.\n", - __func__); + // enqueueMemReq(pkt); + lineFillBuffer.push_back(pkt); + DPRINTF(CoalesceEngine, "%s: Pushed pkt to " + "lineFillBuffer. lineFillBuffer.size = %d.\n", + __func__, lineFillBuffer.size()); + if ((!nextReadOnMissEvent.pending()) && + (!nextReadOnMissEvent.scheduled())) { + schedule(nextReadOnMissEvent, nextCycle()); + } stats.readMisses++; stats.numVertexReads++; return true; @@ -296,6 +302,28 @@ CoalesceEngine::recvWLRead(Addr addr) } } +void +CoalesceEngine::processNextReadOnMissEvent() +{ + if (memQueueFull()) { + nextReadOnMissEvent.sleep(); + // TODO: Implement interface where events of the CoalesceEngine are + // pushed to a fifo to be scheduled later. + return; + } + + PacketPtr pkt = lineFillBuffer.front(); + enqueueMemReq(pkt); + + lineFillBuffer.pop_front(); + + if (!lineFillBuffer.empty()) { + assert(!nextReadOnMissEvent.scheduled()); + assert(!nextReadOnMissEvent.pending()); + schedule(nextReadOnMissEvent, nextCycle()); + } +} + // TODO: For loop to empty the entire responseQueue. 
void CoalesceEngine::processNextRespondEvent() diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e1033a4622..05fa555ec8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -49,6 +49,20 @@ class WLEngine; class CoalesceEngine : public BaseMemEngine { private: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _pending(false) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + }; + struct Block { WorkListItem* items; @@ -93,7 +107,7 @@ class CoalesceEngine : public BaseMemEngine int numMSHREntry; int numTgtsPerMSHR; std::unordered_map> MSHRMap; - + std::deque lineFillBuffer; std::deque> responseQueue; int currentBitSliceIndex; @@ -107,13 +121,16 @@ class CoalesceEngine : public BaseMemEngine int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + MemoryEvent nextReadOnMissEvent; + void processNextReadOnMissEvent(); + EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - EventFunctionWrapper nextEvictEvent; + MemoryEvent nextEvictEvent; void processNextEvictEvent(); EventFunctionWrapper nextSendRetryEvent; From 7db47e2a89611412310f3f50e32df6433a429af4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 22:04:08 -0700 Subject: [PATCH 122/247] Restructuring events and adding nextWriteBackEvent. 
--- src/accl/graph/base/data_structs.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 290 ++++++++++++------------- src/accl/graph/sega/coalesce_engine.hh | 21 +- src/accl/graph/sega/push_engine.cc | 4 +- 4 files changed, 153 insertions(+), 166 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f938be72f1..f178d5a7e2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -90,13 +90,13 @@ static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); template -class FIFOSet +class InOutSet { private: std::unordered_set set; public: - FIFOSet(int cap) + InOutSet(int cap) { set.reserve(cap); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 033c1f3363..ddbd22a8b5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -42,16 +42,17 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntry(params.num_mshr_entry), + numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), currentBitSliceIndex(0), numRetriesReceived(0), applyQueue(numLines), - evictQueue(numLines), - nextReadOnMissEvent([this] { processNextReadOnMissEvent(); }, name()), + writeBackQueue(numLines), + replaceQueue(numLines), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), - nextEvictEvent([this] { processNextEvictEvent(); }, name()), + nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) { @@ -149,7 +150,7 @@ 
CoalesceEngine::getBlockAddrFromBitIndex(int index) bool CoalesceEngine::recvWLRead(Addr addr) { - assert(MSHRMap.size() <= numMSHREntry); + assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; @@ -184,11 +185,11 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHRMap.find(block_index) == MSHRMap.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu not " + if (MSHR.find(block_index) == MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu not " "found in MSHRs.\n", __func__, block_index, addr); - assert(MSHRMap.size() <= numMSHREntry); - if (MSHRMap.size() == numMSHREntry) { + assert(MSHR.size() <= numMSHREntries); + if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); @@ -199,24 +200,26 @@ CoalesceEngine::recvWLRead(Addr addr) } else { DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { - assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); - if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. 
" "Rejecting request.\n", __func__, block_index); stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; - MSHRMap[block_index].push_back(addr); + MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; - if (!cacheBlocks[block_index].busyMask) { + + if ((cacheBlocks[block_index].busyMask == 0) && + (cacheBlocks[block_index].valid)) { applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " "applyQueue.size = %u.\n", __func__, @@ -230,39 +233,31 @@ CoalesceEngine::recvWLRead(Addr addr) } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " - "allocate a cache line for it.\n", - __func__, addr); - if (lineFillBuffer.size() == numMSHREntry) { - DPRINTF(CoalesceEngine, "%s: No space left in " - "lineFillBuffer. Rejecting request.\n", __func__); - stats.readRejections++; - return false; - } + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. 
" + "Allocating a cache line for it.\n" + , __func__, addr); + cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(CoalesceEngine, "%s: Allocated cache line[%d] for " - "Addr: %lu.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + " Addr: %lu.\n", __func__, block_index, addr); - MSHRMap[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); - PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, addr, aligned_addr, peerMemoryAtomSize); // enqueueMemReq(pkt); - lineFillBuffer.push_back(pkt); - DPRINTF(CoalesceEngine, "%s: Pushed pkt to " - "lineFillBuffer. lineFillBuffer.size = %d.\n", - __func__, lineFillBuffer.size()); - if ((!nextReadOnMissEvent.pending()) && - (!nextReadOnMissEvent.scheduled())) { - schedule(nextReadOnMissEvent, nextCycle()); + fillQueue.push_back(block_index); + // FIXME: Fix this DPRINTF + // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " + // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", + // __func__, fillQueue.size()); + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } stats.readMisses++; stats.numVertexReads++; @@ -270,10 +265,10 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu already " + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu already " "in MSHRs.\n", __func__, block_index, addr); - if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -293,7 +288,7 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readHitUnderMisses++; } - MSHRMap[block_index].push_back(addr); + MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.numVertexReads++; @@ -303,24 +298,29 @@ CoalesceEngine::recvWLRead(Addr addr) } void -CoalesceEngine::processNextReadOnMissEvent() +CoalesceEngine::processNextMemoryReadEvent() { if (memQueueFull()) { - nextReadOnMissEvent.sleep(); + nextMemoryReadEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. return; } - PacketPtr pkt = lineFillBuffer.front(); + int block_index = fillQueue.front(); + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + enqueueMemReq(pkt); - lineFillBuffer.pop_front(); + fillQueue.pop_front(); - if (!lineFillBuffer.empty()) { - assert(!nextReadOnMissEvent.scheduled()); - assert(!nextReadOnMissEvent.pending()); - schedule(nextReadOnMissEvent, nextCycle()); + if (!fillQueue.empty()) { + assert(!nextMemoryReadEvent.scheduled()); + assert(!nextMemoryReadEvent.pending()); + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -347,11 +347,13 @@ CoalesceEngine::processNextRespondEvent() } } +// FIXME: Update this for implementing event retry interaction. void CoalesceEngine::recvMemRetry() { - assert(!nextEvictEvent.scheduled()); - schedule(nextEvictEvent, nextCycle()); + // assert(!nextEvictEvent.scheduled()); + // schedule(nextEvictEvent, nextCycle()); + return; } bool @@ -413,10 +415,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); - if ((!nextEvictEvent.scheduled()) && - (!pendingMemRetry())) { - schedule(nextEvictEvent, nextCycle()); + writeBackQueue.push_back(block_index); + if ((!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } } } @@ -477,7 +479,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false - (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR + (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -490,18 +492,18 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; - for (int i = 0; i < MSHRMap[block_index].size(); i++) { - Addr miss_addr = MSHRMap[block_index][i]; + for (int 
i = 0; i < MSHR[block_index].size(); i++) { + Addr miss_addr = MSHR[block_index][i]; Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cache line[%d] could " + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cacheBlocks[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Pushed cache line[%d][%d] to " + DPRINTF(CoalesceEngine, "%s: Pushed cacheBlocks[%d][%d] to " "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); @@ -510,25 +512,25 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // End of the said block servicedIndices.push_back(i); - DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " "removal.\n", __func__, i, block_index); } } // TODO: We Can use taken instead of this - // TODO: Change the MSHRMap from map to map + // TODO: Change the MSHR from map to map int bias = 0; for (int i = 0; i < servicedIndices.size(); i++) { - Addr print_addr = MSHRMap[block_index][i - bias]; - MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + Addr print_addr = MSHR[block_index][i - bias]; + MSHR[block_index].erase(MSHR[block_index].begin() + servicedIndices[i] - bias); bias++; DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", __func__, print_addr); } - if (MSHRMap[block_index].empty()) { - MSHRMap.erase(block_index); + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { assert(cacheBlocks[block_index].hasConflict); @@ -562,13 +564,13 @@ 
CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(CoalesceEngine, "%s: Wrote to cache line[%d][%d] = %s.\n", + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cache line[%d]." + DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cacheBlocks[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); @@ -588,13 +590,13 @@ CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); - if (cacheBlocks[block_index].busyMask) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid apply process. " + if (cacheBlocks[block_index].busyMask != 0) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has no change. Therefore, no apply " + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. Therefore, no apply " "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { @@ -628,17 +630,17 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ - evictQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); + writeBackQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. 
writeBackQueue.size = %u.\n", + __func__, block_index, writeBackQueue.size()); } applyQueue.pop_front(); - if ((!evictQueue.empty()) && - (!pendingMemRetry()) && - (!nextEvictEvent.scheduled())) { - schedule(nextEvictEvent, nextCycle()); + if ((!writeBackQueue.empty()) && + (!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } if ((!applyQueue.empty()) && @@ -648,85 +650,64 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextEvictEvent() +CoalesceEngine::processNextWriteBackEvent() { - int block_index = evictQueue.front(); + if (memQueueFull()) { + nextWriteBackEvent.sleep(); + // TODO: Implement interface where events of the CoalesceEngine are + // pushed to a fifo to be scheduled later. + return; + } - if ((cacheBlocks[block_index].busyMask) || + int block_index = writeBackQueue.front(); + + // Why would we write it back if it does not have a conflict? + assert(cacheBlocks[block_index].hasConflict); + + if ((cacheBlocks[block_index].busyMask != 0) || (applyQueue.find(block_index))) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid evict process. " - "Therefore, ignoring the apply schedule.\n", + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " + "writeback process. Therefore, ignoring the apply schedule.\n", __func__, block_index); + // FIXME: Fix the name of this stat. stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks[block_index].dirty ? - (cacheBlocks[block_index].hasConflict ? 2 : 1) : - (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!allocateMemQueueSpace(space_needed)) { - DPRINTF(CoalesceEngine, "%s: There is not enough space in memReqQueue to " - "procees the eviction of cache line [%d]. 
dirty: %d, " - "hasConflict: %d.\n", __func__, block_index, - cacheBlocks[block_index].dirty, - cacheBlocks[block_index].hasConflict); - requestMemRetry(space_needed); - return; - } else { - if (cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: Change observed on cache line [%d].\n", - __func__, block_index); - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to Addr: %lu, " - "size = %d.\n", __func__, - write_pkt->getAddr(), write_pkt->getSize()); - enqueueMemReq(write_pkt); - } - - if (cacheBlocks[block_index].hasConflict) { - assert(!MSHRMap[block_index].empty()); - Addr miss_addr = MSHRMap[block_index].front(); - DPRINTF(CoalesceEngine, "%s: First conflicting address for cache line[%d]" - " is Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, - read_pkt->getAddr(), read_pkt->getSize()); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Allocated cache line [%d] for Addr: %lu.\n", - __func__, block_index, aligned_miss_addr); - } else { - - // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Deallocated cache line [%d].\n", - __func__, block_index); - } + if (cacheBlocks[block_index].dirty) { + DPRINTF(CoalesceEngine, "%s: Change observed on " + "cacheBlocks[%d].\n", __func__, block_index); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + write_pkt->getAddr(), write_pkt->getSize()); + enqueueMemReq(write_pkt); } + assert(!MSHR[block_index].empty()); + Addr miss_addr = MSHR[block_index].front(); + DPRINTF(CoalesceEngine, "%s: First conflicting address for " + "cacheBlocks[%d] is Addr: %lu.\n", + __func__, block_index, miss_addr); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].dirty = false; + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " + "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); + fillQueue.push_back(block_index); } - evictQueue.pop_front(); + writeBackQueue.pop_front(); - if ((!evictQueue.empty()) && - (!nextEvictEvent.scheduled())) { - schedule(nextEvictEvent, nextCycle()); + if (!writeBackQueue.empty()) { + assert(!nextWriteBackEvent.pending()); + assert(!nextWriteBackEvent.scheduled()); + schedule(nextWriteBackEvent, nextCycle()); } } @@ -817,10 +798,11 @@ CoalesceEngine::processNextSendRetryEvent() deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - 
evictQueue.push_back(block_index); - if ((!nextEvictEvent.scheduled()) && - (!pendingMemRetry())) { - schedule(nextEvictEvent, nextCycle()); + writeBackQueue.push_back(block_index); + if ((!writeBackQueue.empty()) && + (!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } } } @@ -829,6 +811,8 @@ CoalesceEngine::processNextSendRetryEvent() // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. For now we're enabling infinite // queueing in the outstandingMemReqQueue. + // FIXME: Also do not send requests for cache lines that are already + // read but await data. Just set a flag or sth. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 05fa555ec8..563fa671b3 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -104,25 +104,28 @@ class CoalesceEngine : public BaseMemEngine int numLines; int numElementsPerLine; - int numMSHREntry; + int numMSHREntries; int numTgtsPerMSHR; - std::unordered_map> MSHRMap; - std::deque lineFillBuffer; + std::unordered_map> MSHR; + + std::deque fillQueue; + std::deque> responseQueue; int currentBitSliceIndex; int numRetriesReceived; - FIFOSet applyQueue; + InOutSet applyQueue; std::bitset needsPush; - FIFOSet evictQueue; + InOutSet writeBackQueue; + InOutSet replaceQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - MemoryEvent nextReadOnMissEvent; - void processNextReadOnMissEvent(); + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -130,8 +133,8 @@ class CoalesceEngine : public BaseMemEngine EventFunctionWrapper 
nextApplyEvent; void processNextApplyEvent(); - MemoryEvent nextEvictEvent; - void processNextEvictEvent(); + MemoryEvent nextWriteBackEvent; + void processNextWriteBackEvent(); EventFunctionWrapper nextSendRetryEvent; void processNextSendRetryEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ab2962b253..5ab8db401c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -117,7 +117,7 @@ PushEngine::deallocatePushSpace(int space) (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " "free spaces.\n", __func__, free_space); - if ((free_space > numElementsPerLine) && + if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); @@ -217,7 +217,7 @@ PushEngine::processNextAddrGenEvent() (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" "free spaces.\n", __func__, free_space); - if ((free_space > numElementsPerLine) && + if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); From e0f5242c06f12b799b76455d0b95ba90e6238e74 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 23:57:58 -0700 Subject: [PATCH 123/247] Implemented MemoryEvent retry mechanism. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 87 ++++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 5 +- src/accl/graph/sega/push_engine.cc | 17 +++-- src/accl/graph/sega/push_engine.hh | 3 + 5 files changed, 88 insertions(+), 26 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index eb209911be..ffd74241e7 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -19,7 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=0) + outstanding_mem_req_queue_size=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ddbd22a8b5..4a0600e9c0 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -300,10 +300,16 @@ CoalesceEngine::recvWLRead(Addr addr) void CoalesceEngine::processNextMemoryReadEvent() { + assert(!nextMemoryReadEvent.pending()); if (memQueueFull()) { - nextMemoryReadEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. 
+ nextMemoryReadEvent.sleep(); + if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextMemoryReadEvent"); return; } @@ -351,8 +357,33 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::recvMemRetry() { - // assert(!nextEvictEvent.scheduled()); - // schedule(nextEvictEvent, nextCycle()); + assert(!pendingEventQueue.empty()); + std::string front = pendingEventQueue.front(); + + if (front == "nextMemoryReadEvent") { + assert(!nextMemoryReadEvent.scheduled()); + assert(nextMemoryReadEvent.pending()); + schedule(nextMemoryReadEvent, nextCycle()); + nextMemoryReadEvent.wake(); + } else if (front == "nextWriteBackEvent") { + assert(!nextWriteBackEvent.scheduled()); + assert(nextWriteBackEvent.pending()); + schedule(nextWriteBackEvent, nextCycle()); + nextWriteBackEvent.wake(); + } else if (front == "nextSendRetryEvent") { + assert(!nextSendRetryEvent.scheduled()); + assert(nextSendRetryEvent.pending()); + breakPointFunction(); + schedule(nextSendRetryEvent, nextCycle()); + nextSendRetryEvent.wake(); + } else { + panic("EVENT IS NOT RECOGNIZED.\n"); + } + + pendingEventQueue.pop_front(); + if (!pendingEventQueue.empty()) { + requestMemRetry(1); + } return; } @@ -652,10 +683,16 @@ CoalesceEngine::processNextApplyEvent() void CoalesceEngine::processNextWriteBackEvent() { + assert(!nextWriteBackEvent.pending()); if (memQueueFull()) { nextWriteBackEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. 
+ if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextWriteBackEvent"); return; } @@ -715,20 +752,25 @@ void CoalesceEngine::recvPushRetry() { numRetriesReceived++; - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } + // For now since we do only one retry at a time, we should not receive + // a retry while this nextSendingRetryEvent is scheduled or is pending. + assert(!nextSendRetryEvent.pending()); + assert(!nextSendRetryEvent.scheduled()); + assert(numRetriesReceived == 1); + schedule(nextSendRetryEvent, nextCycle()); } void CoalesceEngine::processNextSendRetryEvent() { - if (needsPush.count() == 0) { - DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " - "bit in needsPush. Rejecting the retry.\n", __func__); - peerPushEngine->recvRetryReject(); - return; - } + assert(!nextSendRetryEvent.pending()); + assert(needsPush.count() != 0); + // if (needsPush.count() == 0) { + // DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " + // "bit in needsPush. Rejecting the retry.\n", __func__); + // peerPushEngine->recvRetryReject(); + // return; + // } DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; @@ -807,6 +849,16 @@ CoalesceEngine::processNextSendRetryEvent() } } } else { + if (memQueueFull()) { + nextSendRetryEvent.sleep(); + if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextSendRetryEvent"); + return; + } + // FIXME: Fix the retry mechanism between memory and cache to // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. 
For now we're enabling infinite @@ -816,17 +868,12 @@ CoalesceEngine::processNextSendRetryEvent() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - if (allocateMemQueueSpace(1)) { - enqueueMemReq(pkt); - } else { - requestMemRetry(1); - } + enqueueMemReq(pkt); } numRetriesReceived--; - if ((numRetriesReceived > 0) && (!nextSendRetryEvent.scheduled())) { - schedule(nextSendRetryEvent, nextCycle()); - } + assert(numRetriesReceived == 0); + assert(!nextSendRetryEvent.scheduled()); } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 563fa671b3..83ca6e5f14 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -124,6 +124,8 @@ class CoalesceEngine : public BaseMemEngine int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + std::deque pendingEventQueue; + MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); @@ -136,7 +138,7 @@ class CoalesceEngine : public BaseMemEngine MemoryEvent nextWriteBackEvent; void processNextWriteBackEvent(); - EventFunctionWrapper nextSendRetryEvent; + MemoryEvent nextSendRetryEvent; void processNextSendRetryEvent(); struct CoalesceStats : public statistics::Group @@ -159,6 +161,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; + void breakPointFunction() { std::cout << "Salaam." 
<< std::endl; } protected: virtual int respBuffSize() { return -1; } virtual void recvMemRetry(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5ab8db401c..c64ff003c4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,6 +43,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): numTotalRetries(0), numPendingRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), + nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) {} @@ -121,8 +122,8 @@ PushEngine::deallocatePushSpace(int space) (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); + assert(!nextSendRetryEvent.scheduled()); + schedule(nextSendRetryEvent, nextCycle()); } } } @@ -221,8 +222,8 @@ PushEngine::processNextAddrGenEvent() (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); + assert(!nextSendRetryEvent.scheduled()); + schedule(nextSendRetryEvent, nextCycle()); } } } @@ -239,6 +240,14 @@ PushEngine::processNextAddrGenEvent() } } +void +PushEngine::processNextSendRetryEvent() +{ + assert(numPendingRetries == 0); + numPendingRetries++; + peerCoalesceEngine->recvPushRetry(); +} + void PushEngine::recvMemRetry() { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a3a308554f..378cd1a487 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -128,6 +128,9 @@ class PushEngine : public BaseMemEngine EventFunctionWrapper nextPushEvent; void processNextPushEvent(); + EventFunctionWrapper nextSendRetryEvent; + void processNextSendRetryEvent(); + struct PushStats : public statistics::Group { 
PushStats(PushEngine &push); From 42ff3b88231d9f69c4f0fcb7ccbddfc2db66d799 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Jul 2022 17:43:22 -0700 Subject: [PATCH 124/247] Adding DPRINTF for structure sizes. --- src/accl/graph/SConscript | 2 +- src/accl/graph/base/base_mem_engine.cc | 47 +++++--- src/accl/graph/base/base_mem_engine.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 5 +- src/accl/graph/sega/wl_engine.cc | 151 +++++++++++++++---------- src/accl/graph/sega/wl_engine.hh | 8 +- 7 files changed, 134 insertions(+), 85 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index f5f7e962af..7fd3591b2c 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,5 +27,5 @@ Import('*') - +DebugFlag('SEGAStructureSize') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index cb4c1d81bb..aa78aac8b5 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/base/base_mem_engine.hh" #include "debug/BaseMemEngine.hh" +#include "debug/SEGAStructureSize.hh" namespace gem5 { @@ -37,7 +38,7 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + memQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), respQueueSize(params.resp_queue_size), memRetryRequested(false), @@ -99,17 +100,22 @@ BaseMemEngine::processNextMemReqEvent() { if ((respQueueSize == 0) || ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { - PacketPtr pkt = outstandingMemReqQueue.front(); + PacketPtr pkt = memQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following 
info. " "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); - outstandingMemReqQueue.pop_front(); - + memQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped pkt: %s from " + "memQueue. memQueue.size = %d, memQueueSize = %d.\n", + __func__, pkt->print(), memQueue.size(), memQueueSize); + DPRINTF(BaseMemEngine, "%s: Popped pkt: %s from " + "memQueue. memQueue.size = %d, memQueueSize = %d.\n", + __func__, pkt->print(), memQueue.size(), memQueueSize); if (memRetryRequested && - (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - memSpaceRequested))) { + (memQueue.size() <= + (memQueueSize - memSpaceRequested))) { memRetryRequested = false; memSpaceRequested = 0; recvMemRetry(); @@ -117,7 +123,7 @@ BaseMemEngine::processNextMemReqEvent() } if ((!memPort.blocked()) && - (!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + (!memQueue.empty()) && (!nextMemReqEvent.scheduled())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -156,30 +162,35 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) bool BaseMemEngine::allocateMemQueueSpace(int space) { - assert((outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + assert((memQueueSize == 0) || + (memQueue.size() <= memQueueSize)); return ( - (outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space)) + (memQueueSize == 0) || + (memQueue.size() <= (memQueueSize - space)) ); } bool BaseMemEngine::memQueueFull() { - assert((outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + assert((memQueueSize == 0) || + (memQueue.size() <= memQueueSize)); return ( - (outstandingMemReqQueueSize != 0) && - (outstandingMemReqQueue.size() == outstandingMemReqQueueSize)); + (memQueueSize != 0) && + (memQueue.size() == memQueueSize)); } void BaseMemEngine::enqueueMemReq(PacketPtr pkt) { 
panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); - outstandingMemReqQueue.push_back(pkt); - + memQueue.push_back(pkt); + DPRINTF(SEGAStructureSize, "%s: Pushed pkt: %s to memQueue. " + "memQueue.size = %d, memQueueSize = %d.\n", __func__, + pkt->print(), memQueue.size(), memQueueSize); + DPRINTF(BaseMemEngine, "%s: Pushed pkt: %s to memQueue. " + "memQueue.size = %d, memQueueSize = %d.\n", __func__, + pkt->print(), memQueue.size(), memQueueSize); if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { schedule(nextMemReqEvent, nextCycle()); } @@ -199,7 +210,7 @@ void BaseMemEngine::wakeUp() { assert(!nextMemReqEvent.scheduled()); - if (!outstandingMemReqQueue.empty()) { + if (!memQueue.empty()) { schedule(nextMemReqEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 64ef49ee1d..520970c5a0 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -68,12 +68,12 @@ class BaseMemEngine : public ClockedObject System* system; MemPort memPort; - int outstandingMemReqQueueSize; + int memQueueSize; int onTheFlyReqs; int respQueueSize; bool memRetryRequested; int memSpaceRequested; - std::deque outstandingMemReqQueue; + std::deque memQueue; EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4a0600e9c0..ea572ea749 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -862,7 +862,7 @@ CoalesceEngine::processNextSendRetryEvent() // FIXME: Fix the retry mechanism between memory and cache to // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. For now we're enabling infinite - // queueing in the outstandingMemReqQueue. + // queueing in the memQueue. 
// FIXME: Also do not send requests for cache lines that are already // read but await data. Just set a flag or sth. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c64ff003c4..d745dabef6 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -222,8 +222,9 @@ PushEngine::processNextAddrGenEvent() (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - assert(!nextSendRetryEvent.scheduled()); - schedule(nextSendRetryEvent, nextCycle()); + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } } } } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 79bf046ba3..2d4ffc9cac 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/wl_engine.hh" +#include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" #include "mem/packet_access.hh" @@ -39,7 +40,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".resp_port", this), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), - onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), + registerFileSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) @@ -129,45 +130,68 @@ WLEngine::processNextReadEvent() uint32_t update_value; std::tie(update_addr, update_value) = updateQueue.front(); - DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. Addr: %lu, " - "value: %u.\n", __func__, update_addr, update_value); + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
" + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); - if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { - DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", - __func__, update_addr); - if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + // TODO: It might be a good idea for WLEngine to act differently + // on cache rejects. As a first step the cache should not just + // return a boolean value. It should return an integer/enum + // to tell WLEngine why it rejected the read request. Their might + // be things that WLEngine can do to fix head of the line blocking. if (coalesceEngine->recvWLRead(update_addr)) { - onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.\n", __func__, - update_addr, onTheFlyUpdateMap[update_addr]); + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = update_value; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); - DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" - ". 
updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); respPort.checkRetryReq(); } } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(WLEngine, "%s: Found the addr: %lu in onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, - update_addr, onTheFlyUpdateMap[update_addr]); - onTheFlyUpdateMap[update_addr] = - std::min(update_value, onTheFlyUpdateMap[update_addr]); - DPRINTF(WLEngine, "%s: Reduced the update_value with the entry in " - "onTheFlyUpdateMap. onTheFlyUpdateMap[%lu] = %u.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); - stats.onTheFlyCoalesce++; + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", + __func__, update_addr, update_addr, registerFile[update_addr]); + registerFile[update_addr] = + std::min(update_value, registerFile[update_addr]); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, registerFile[update_addr]); + stats.registerFileCoalesce++; updateQueue.pop_front(); - DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" - ". updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); respPort.checkRetryReq(); } - // TODO: Only schedule nextReadEvent only when it has to be scheduled - if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { + if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { schedule(nextReadEvent, nextCycle()); } } @@ -175,14 +199,16 @@ WLEngine::processNextReadEvent() void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { - assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + assert(workListFile.size() <= registerFileSize); - addrWorkListMap[addr] = wl; - DPRINTF(WLEngine, "%s: Received a WorkListItem from the coalesceEngine. Adding" - " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", - __func__, addr, wl.to_string()); - - assert(!addrWorkListMap.empty()); + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + wl.to_string(), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + wl.to_string(), workListFile.size()); + assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } @@ -191,28 +217,31 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - for (auto &it : addrWorkListMap) { + for (auto &it : workListFile) { Addr addr = it.first; - assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); - uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(WLEngine, "%s: Reducing between onTheFlyUpdateMap and " - "addrWorkListMap values. 
onTheFlyUpdateMap[%lu] = %u, " - "addrWorkListMap[%lu] = %s.\n", __func__, - addr, onTheFlyUpdateMap[addr], - addr, addrWorkListMap[addr].to_string()); + assert(registerFile.find(addr) != registerFile.end()); + uint32_t update_value = registerFile[addr]; + DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" + ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", + __func__, addr, registerFile[addr], + addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min - addrWorkListMap[addr].tempProp = - std::min(update_value, addrWorkListMap[addr].tempProp); - DPRINTF(WLEngine, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", - __func__, addr, addrWorkListMap[addr].to_string()); + workListFile[addr].tempProp = + std::min(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", + __func__, addr, workListFile[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); - onTheFlyUpdateMap.erase(addr); - DPRINTF(WLEngine, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, addr); + coalesceEngine->recvWLWrite(addr, workListFile[addr]); + registerFile.erase(addr); + DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " + "registerFile.size = %d, registerFileSize = %d\n", + __func__, addr, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. " + "registerFile.size = %d, registerFileSize = %d\n", + __func__, addr, registerFile.size(), registerFileSize); } - addrWorkListMap.clear(); + workListFile.clear(); } bool @@ -224,11 +253,19 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" - ". 
updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + + + // delete the packet since it's not needed anymore. delete pkt; - assert(!updateQueue.empty()); + if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } @@ -241,7 +278,7 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), - ADD_STAT(onTheFlyCoalesce, statistics::units::Count::get(), + ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), "Number of memory blocks read for vertecies") { } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 27fc3efa7a..79fe60f6d0 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -74,10 +74,10 @@ class WLEngine : public BaseReduceEngine int updateQueueSize; std::deque> updateQueue; - int onTheFlyUpdateMapSize; - std::unordered_map onTheFlyUpdateMap; + int registerFileSize; + std::unordered_map registerFile; - std::unordered_map addrWorkListMap; + std::unordered_map workListFile; void recvFunctional(PacketPtr pkt); @@ -98,7 +98,7 @@ class WLEngine : public BaseReduceEngine WLEngine &wl; statistics::Scalar numReduce; - statistics::Scalar onTheFlyCoalesce; + statistics::Scalar registerFileCoalesce; }; WorkListStats stats; From 5f513830921f24659a9e7fcb8aea10720a27840a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Jul 2022 17:44:06 -0700 Subject: [PATCH 125/247] Updating config script for sega. 
--- configs/accl/sega.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ffd74241e7..cf189733f0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=4, + push_req_queue_size=16, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=4, + resp_queue_size=8) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=1, - num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=1) + num_mshr_entry=8, + num_tgts_per_mshr=8, + outstanding_mem_req_queue_size=8) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=16, + on_the_fly_update_map_size=8) def getRespPort(self): return self.wl_engine.resp_port From ed206a8acdb86f3aa17df9e1d3d44e241385c67e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:14:08 -0700 Subject: [PATCH 126/247] Adding more assertion for MSHR and fillQueue. 
--- configs/accl/sega.py | 12 ++++++------ src/accl/graph/sega/coalesce_engine.cc | 3 +++ src/accl/graph/sega/push_engine.cc | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index cf189733f0..8fb3b75996 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -16,13 +16,13 @@ def __init__(self, base_edge_addr): self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="1MiB", - num_mshr_entry=8, - num_tgts_per_mshr=8, - outstanding_mem_req_queue_size=8) + cache_size="128B", + num_mshr_entry=1, + num_tgts_per_mshr=1, + outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, - on_the_fly_update_map_size=8) + update_queue_size=1, + on_the_fly_update_map_size=4) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ea572ea749..8f56962a8c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -232,6 +232,7 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { assert(!cacheBlocks[block_index].valid); + assert(MSHR[block_index].size() == 0); // MSHR available and no conflict DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " "Allocating a cache line for it.\n" @@ -251,6 +252,7 @@ CoalesceEngine::recvWLRead(Addr addr) // enqueueMemReq(pkt); fillQueue.push_back(block_index); + assert(fillQueue.size() <= numLines); // FIXME: Fix this DPRINTF // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", @@ -737,6 +739,7 @@ CoalesceEngine::processNextWriteBackEvent() DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); fillQueue.push_back(block_index); + assert(fillQueue.size() <= numLines); } writeBackQueue.pop_front(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d745dabef6..a41ca8a778 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -217,7 +217,7 @@ PushEngine::processNextAddrGenEvent() int free_space = pushReqQueueSize - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - "free spaces.\n", __func__, free_space); + " free spaces.\n", __func__, free_space); if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " From cdfd9817d9a3908fc86b2ec1f95420524b953ea3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:27:10 -0700 Subject: [PATCH 127/247] Adding debug flags for responseQueue size. --- src/accl/graph/sega/coalesce_engine.cc | 41 +++++++++++++++++++------- src/accl/graph/sega/wl_engine.hh | 2 ++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8f56962a8c..959bfa9743 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,6 +32,7 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" namespace gem5 @@ -168,11 +169,18 @@ CoalesceEngine::recvWLRead(Addr addr) // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " - "to responseQueue. 
responseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; @@ -345,9 +353,12 @@ CoalesceEngine::processNextRespondEvent() __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); if ((!nextRespondEvent.scheduled()) && (!responseQueue.empty())) { @@ -536,10 +547,18 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Pushed cacheBlocks[%d][%d] to " - "responseQueue. 
responseQueue.size = %u.\n" - , __func__, block_index, wl_offset, - responseQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 79fe60f6d0..5e8e5b25f3 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -114,6 +114,8 @@ class WLEngine : public BaseReduceEngine bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); + + int getRegisterFileSize() { return registerFileSize; } }; } From 4a466aec9457f93be6bfa689489c8376c08d31c6 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:33:53 -0700 Subject: [PATCH 128/247] Adding assertions to test the size of queues in coalesce engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 10 +++++++++- src/accl/graph/sega/coalesce_engine.hh | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 959bfa9743..753bfc988b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,7 +49,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numRetriesReceived(0), applyQueue(numLines), writeBackQueue(numLines), - replaceQueue(numLines), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), @@ -320,6 +319,8 @@ CoalesceEngine::processNextMemoryReadEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextMemoryReadEvent"); + // Maximum three MemoryEvent. + assert(pendingEventQueue.size() <= 3); return; } @@ -460,6 +461,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (cacheBlocks[block_index].hasConflict) { writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); if ((!nextWriteBackEvent.pending()) && (!nextWriteBackEvent.scheduled())) { schedule(nextWriteBackEvent, nextCycle()); @@ -683,6 +685,7 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. writeBackQueue.size = %u.\n", __func__, block_index, writeBackQueue.size()); } @@ -714,6 +717,8 @@ CoalesceEngine::processNextWriteBackEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextWriteBackEvent"); + // Maximum three MemoryEvent. 
+ assert(pendingEventQueue.size() <= 3); return; } @@ -863,6 +868,7 @@ CoalesceEngine::processNextSendRetryEvent() } if (cacheBlocks[block_index].hasConflict) { writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); if ((!writeBackQueue.empty()) && (!nextWriteBackEvent.pending()) && (!nextWriteBackEvent.scheduled())) { @@ -878,6 +884,8 @@ CoalesceEngine::processNextSendRetryEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextSendRetryEvent"); + // Maximum three MemoryEvent. + assert(pendingEventQueue.size() <= 3); return; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 83ca6e5f14..cfa0a79102 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,7 +118,6 @@ class CoalesceEngine : public BaseMemEngine std::bitset needsPush; InOutSet writeBackQueue; - InOutSet replaceQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); From 48711528ef72651cccb68b08303159ce8b3fc071 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 22:43:28 -0700 Subject: [PATCH 129/247] Checking the size of queues in PushEngine and WLEngine --- src/accl/graph/base/base_mem_engine.cc | 2 +- src/accl/graph/base/base_mem_engine.hh | 3 ++- src/accl/graph/sega/push_engine.cc | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index aa78aac8b5..590307b2bc 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -40,10 +40,10 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): memPort(name() + ".mem_port", this), memQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), - respQueueSize(params.resp_queue_size), memRetryRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), + respQueueSize(params.resp_queue_size), 
_requestorId(system->getRequestorId(this)), peerMemoryAtomSize(params.attached_memory_atom_size) {} diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 520970c5a0..01c862d555 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -70,7 +70,6 @@ class BaseMemEngine : public ClockedObject int memQueueSize; int onTheFlyReqs; - int respQueueSize; bool memRetryRequested; int memSpaceRequested; std::deque memQueue; @@ -79,6 +78,8 @@ class BaseMemEngine : public ClockedObject void processNextMemReqEvent(); protected: + + int respQueueSize; const RequestorID _requestorId; size_t peerMemoryAtomSize; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a41ca8a778..cfebf8e5df 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -173,6 +173,7 @@ PushEngine::recvWLItemRetry(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + assert(pushReqQueue.size() <= pushReqQueueSize); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -263,6 +264,7 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); + assert(memRespQueue.size() <= respQueueSize); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); From 29ae1de4908cf215a44bcd8c9db9091c8306cf1b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 09:33:11 -0700 Subject: [PATCH 130/247] Making CoalesceEngine a BaseMemoryEngine. 
--- configs/accl/sega.py | 13 ++- src/accl/graph/sega/BaseMemoryEngine.py | 42 ++++++++ src/accl/graph/sega/CoalesceEngine.py | 17 ++- src/accl/graph/sega/SConscript | 3 + src/accl/graph/sega/base_memory_engine.cc | 122 ++++++++++++++++++++++ src/accl/graph/sega/base_memory_engine.hh | 99 ++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 70 ++++--------- src/accl/graph/sega/coalesce_engine.hh | 14 +-- 8 files changed, 305 insertions(+), 75 deletions(-) create mode 100644 src/accl/graph/sega/BaseMemoryEngine.py create mode 100644 src/accl/graph/sega/base_memory_engine.cc create mode 100644 src/accl/graph/sega/base_memory_engine.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8fb3b75996..7577331f2b 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,19 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=2, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=4, - resp_queue_size=8) + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="128B", + cache_size="32B", num_mshr_entry=1, - num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=0) + num_tgts_per_mshr=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, - on_the_fly_update_map_size=4) + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 7667a22c5a..536c3477ae 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,21 +27,16 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseMemEngine import BaseMemEngine +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class CoalesceEngine(BaseMemEngine): +class CoalesceEngine(BaseMemoryEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - peer_push_engine = Param.PushEngine(NULL, "") - - cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") - - num_mshr_entry = Param.Int(4, "") - num_tgts_per_mshr = Param.Int(20, "") - - # Don't change. If changed. It will break functionality of coalesce. 
- resp_queue_size = 0 + peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") + cache_size = Param.MemorySize("16KiB", "Size of the internal SRAM array.") + num_mshr_entry = Param.Int(4, "Number of MSHR entries.") + num_tgts_per_mshr = Param.Int(20, "Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 77e508f4ed..97a62d44a0 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,16 +27,19 @@ Import('*') +SimObject('BaseMemoryEngine.py') SimObject('CenteralController.py') SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') +Source('base_memory_engine.cc') Source('centeral_controller.cc') Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') +DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') DebugFlag('CoalesceEngine') diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc new file mode 100644 index 0000000000..e5e78f2c04 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/base_memory_engine.hh" + +#include "debug/BaseMemoryEngine.hh" +#include "debug/SEGAStructureSize.hh" + +namespace gem5 +{ + +BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)), + memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size) +{} + +BaseMemoryEngine::~BaseMemoryEngine() +{} + +Port& +BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } else { + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + 
sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..8fb8fde7e6 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ + +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseMemoryEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseMemoryEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseMemoryEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseMemoryEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + protected: + System* system; + const RequestorID _requestorId; + + MemPort memPort; + + size_t peerMemoryAtomSize; + + virtual void recvMemRetry() = 0; + virtual bool handleMemResp(PacketPtr pkt) = 0; + + PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + + 
public: + PARAMS(BaseMemoryEngine); + + BaseMemoryEngine(const Params ¶ms); + ~BaseMemoryEngine(); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } + + void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + +}; + +} + +#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 753bfc988b..678cf0456e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -38,8 +38,8 @@ namespace gem5 { -CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): - BaseMemEngine(params), +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), @@ -67,12 +67,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): needsPush.reset(); } -void -CoalesceEngine::recvFunctional(PacketPtr pkt) -{ - sendMemFunctional(pkt); -} - void CoalesceEngine::startup() { @@ -171,13 +165,13 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. 
@@ -257,7 +251,6 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - // enqueueMemReq(pkt); fillQueue.push_back(block_index); assert(fillQueue.size() <= numLines); // FIXME: Fix this DPRINTF @@ -310,16 +303,12 @@ void CoalesceEngine::processNextMemoryReadEvent() { assert(!nextMemoryReadEvent.pending()); - if (memQueueFull()) { + if (memPort.blocked()) { // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. nextMemoryReadEvent.sleep(); - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextMemoryReadEvent"); - // Maximum three MemoryEvent. + // Maximum three MemoryEvents. assert(pendingEventQueue.size() <= 3); return; } @@ -330,7 +319,7 @@ CoalesceEngine::processNextMemoryReadEvent() DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - enqueueMemReq(pkt); + memPort.sendPacket(pkt); fillQueue.pop_front(); @@ -367,11 +356,13 @@ CoalesceEngine::processNextRespondEvent() } } -// FIXME: Update this for implementing event retry interaction. void CoalesceEngine::recvMemRetry() { - assert(!pendingEventQueue.empty()); + if (pendingEventQueue.empty()) { + return; + } + std::string front = pendingEventQueue.front(); if (front == "nextMemoryReadEvent") { @@ -387,7 +378,6 @@ CoalesceEngine::recvMemRetry() } else if (front == "nextSendRetryEvent") { assert(!nextSendRetryEvent.scheduled()); assert(nextSendRetryEvent.pending()); - breakPointFunction(); schedule(nextSendRetryEvent, nextCycle()); nextSendRetryEvent.wake(); } else { @@ -395,12 +385,10 @@ CoalesceEngine::recvMemRetry() } pendingEventQueue.pop_front(); - if (!pendingEventQueue.empty()) { - requestMemRetry(1); - } return; } +// FIXME: Fix this function. 
bool CoalesceEngine::handleMemResp(PacketPtr pkt) { @@ -552,13 +540,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. @@ -708,14 +696,8 @@ void CoalesceEngine::processNextWriteBackEvent() { assert(!nextWriteBackEvent.pending()); - if (memQueueFull()) { + if (memPort.blocked()) { nextWriteBackEvent.sleep(); - // TODO: Implement interface where events of the CoalesceEngine are - // pushed to a fifo to be scheduled later. - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextWriteBackEvent"); // Maximum three MemoryEvent. 
assert(pendingEventQueue.size() <= 3); @@ -744,7 +726,7 @@ CoalesceEngine::processNextWriteBackEvent() DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); - enqueueMemReq(write_pkt); + memPort.sendPacket(write_pkt); } assert(!MSHR[block_index].empty()); Addr miss_addr = MSHR[block_index].front(); @@ -764,6 +746,10 @@ CoalesceEngine::processNextWriteBackEvent() "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); fillQueue.push_back(block_index); assert(fillQueue.size() <= numLines); + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())){ + schedule(nextMemoryReadEvent, nextCycle()); + } } writeBackQueue.pop_front(); @@ -792,12 +778,6 @@ CoalesceEngine::processNextSendRetryEvent() { assert(!nextSendRetryEvent.pending()); assert(needsPush.count() != 0); - // if (needsPush.count() == 0) { - // DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " - // "bit in needsPush. Rejecting the retry.\n", __func__); - // peerPushEngine->recvRetryReject(); - // return; - // } DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; @@ -877,12 +857,8 @@ CoalesceEngine::processNextSendRetryEvent() } } } else { - if (memQueueFull()) { + if (memPort.blocked()) { nextSendRetryEvent.sleep(); - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextSendRetryEvent"); // Maximum three MemoryEvent. 
assert(pendingEventQueue.size() <= 3); @@ -898,7 +874,7 @@ CoalesceEngine::processNextSendRetryEvent() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - enqueueMemReq(pkt); + memPort.sendPacket(pkt); } numRetriesReceived--; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index cfa0a79102..a322379b05 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -31,7 +31,7 @@ #include -#include "accl/graph/base/base_mem_engine.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" @@ -39,14 +39,12 @@ #define MAX_BITVECTOR_SIZE (1 << 30) -// TODO: Add parameters for size, memory atom size, type size, -// length of items in the blocks. namespace gem5 { class WLEngine; -class CoalesceEngine : public BaseMemEngine +class CoalesceEngine : public BaseMemoryEngine { private: class MemoryEvent : public EventFunctionWrapper @@ -160,16 +158,14 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; - void breakPointFunction() { std::cout << "Salaam." << std::endl; } protected: - virtual int respBuffSize() { return -1; } virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(CoalesceEngine); - CoalesceEngine(const CoalesceEngineParams ¶ms); + CoalesceEngine(const Params ¶ms); bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); @@ -178,9 +174,7 @@ class CoalesceEngine : public BaseMemEngine void recvPushRetry(); - void recvFunctional(PacketPtr pkt); - - virtual void startup(); + virtual void startup() override; }; } From bbc7e3afbea04fd283157f89d024f4f9b9c2d78d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 13:06:22 -0700 Subject: [PATCH 131/247] Fixing cache mapping issue. 
--- src/accl/graph/SConscript | 3 +- src/accl/graph/sega/base_memory_engine.cc | 14 +++ src/accl/graph/sega/base_memory_engine.hh | 2 + src/accl/graph/sega/coalesce_engine.cc | 105 ++++++++++++---------- src/accl/graph/sega/coalesce_engine.hh | 6 +- 5 files changed, 78 insertions(+), 52 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 7fd3591b2c..53c6411de6 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,4 +28,5 @@ Import('*') DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', + 'BaseMemEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index e5e78f2c04..9db95d6bd6 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -55,6 +55,20 @@ BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) } } +void +BaseMemoryEngine::init() +{ + AddrRangeList memory_ranges = memPort.getAddrRanges(); + // BaseMemoryEngine only supports one memory. + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is %s. " + "The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); +} + void BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index 8fb8fde7e6..efbfa5312d 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -69,6 +69,7 @@ class BaseMemoryEngine : public ClockedObject System* system; const RequestorID _requestorId; + AddrRange peerMemoryRange; MemPort memPort; size_t peerMemoryAtomSize; @@ -92,6 +93,7 @@ class BaseMemoryEngine : public ClockedObject void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + virtual void init() override; }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 678cf0456e..21f048213a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -67,44 +67,48 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): needsPush.reset(); } -void -CoalesceEngine::startup() -{ - AddrRangeList vertex_ranges = getAddrRanges(); - - bool found = false; - Addr first_match_addr = 0; - while(true) { - for (auto range: vertex_ranges) { - if (range.contains(first_match_addr)) { - found = true; - break; - } - } - if (found) { - break; - } - first_match_addr += peerMemoryAtomSize; - } - - found = false; - Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - while(true) { - for (auto range: vertex_ranges) { - if (range.contains(second_match_addr)) { - found = true; - break; - } - } - if (found) { - break; - } - second_match_addr += peerMemoryAtomSize; - } - - nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); - memoryAddressOffset = first_match_addr; -} +// void +// CoalesceEngine::startup() +// { +// return; + // std::cout << "Hello" << std::endl; + // DPRINTF(CoalesceEngine, "%s: Range attached to this engine is %s.\n", + // __func__, peerMemoryRange.to_string()); + // AddrRangeList vertex_ranges = getAddrRanges(); + + 
// bool found = false; + // Addr first_match_addr = 0; + // while(true) { + // for (auto range: vertex_ranges) { + // if (range.contains(first_match_addr)) { + // found = true; + // break; + // } + // } + // if (found) { + // break; + // } + // first_match_addr += peerMemoryAtomSize; + // } + + // found = false; + // Addr second_match_addr = first_match_addr + peerMemoryAtomSize; + // while(true) { + // for (auto range: vertex_ranges) { + // if (range.contains(second_match_addr)) { + // found = true; + // break; + // } + // } + // if (found) { + // break; + // } + // second_match_addr += peerMemoryAtomSize; + // } + + // nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); + // memoryAddressOffset = first_match_addr; +// } void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) @@ -117,7 +121,10 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - return ((int) (addr / peerMemoryAtomSize)) % numLines; + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + DPRINTF(CoalesceEngine, "%s: Trimming addr: %lu to %lu.\n", + __func__, addr, trimmed_addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } // addr should be aligned to peerMemoryAtomSize @@ -125,10 +132,10 @@ int CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - int bit_index = atom_index * block_bits; - return bit_index; + return atom_index * block_bits; } // index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) @@ -136,9 +143,8 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr block_addr = (nmpu * peerMemoryAtomSize) * - 
((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); - return (block_addr + memoryAddressOffset); + Addr trimmed_addr = index * sizeof(WorkListItem); + return peerMemoryRange.addIntlvBits(trimmed_addr); } bool @@ -149,7 +155,8 @@ CoalesceEngine::recvWLRead(Addr addr) __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; assert(aligned_addr % peerMemoryAtomSize == 0); - int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); @@ -507,7 +514,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } Addr addr = pkt->getAddr(); - int block_index = (addr / peerMemoryAtomSize) % numLines; + // int block_index = (addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(addr); DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); @@ -591,7 +599,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index a322379b05..28b204e198 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -91,8 +91,8 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; - int nmpu; - Addr memoryAddressOffset; + // int nmpu; + // Addr memoryAddressOffset; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -174,7 +174,7 @@ class CoalesceEngine : public BaseMemoryEngine void recvPushRetry(); - virtual void startup() override; + // virtual void startup() override; }; } From 6c9e7c8d4c68d72742a39a50918f4df35eaa663c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 20:51:48 -0700 Subject: [PATCH 132/247] Refactoring PushEngine to inherit from BaseMemoryEngine. 
--- src/accl/graph/sega/CoalesceEngine.py | 6 +- src/accl/graph/sega/PushEngine.py | 15 ++- src/accl/graph/sega/WLEngine.py | 11 +- src/accl/graph/sega/base_memory_engine.hh | 20 ++- src/accl/graph/sega/coalesce_engine.hh | 14 --- src/accl/graph/sega/push_engine.cc | 143 +++++++++++----------- src/accl/graph/sega/push_engine.hh | 17 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 8 files changed, 117 insertions(+), 111 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 536c3477ae..06c6f92750 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -36,7 +36,7 @@ class CoalesceEngine(BaseMemoryEngine): peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") - cache_size = Param.MemorySize("16KiB", "Size of the internal SRAM array.") + cache_size = Param.MemorySize("Size of the internal SRAM array.") - num_mshr_entry = Param.Int(4, "Number of MSHR entries.") - num_tgts_per_mshr = Param.Int(20, "Number of Targets Per MSHR.") + num_mshr_entry = Param.Int("Number of MSHR entries.") + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index d3276799aa..447731219e 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,13 +27,20 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseMemEngine import BaseMemEngine +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class PushEngine(BaseMemEngine): +class PushEngine(BaseMemoryEngine): type = 'PushEngine' cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr("") - push_req_queue_size = Param.Int(0, "") + base_edge_addr = Param.Addr("The base address for the " + "attached edge memory") + push_req_queue_size = Param.Int("Size of the queue to " + "queue 
push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index cab47fbe7b..98089328f4 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -35,6 +35,11 @@ class WLEngine(BaseReduceEngine): cxx_class = 'gem5::WLEngine' resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoalesceEngine(NULL, "") - update_queue_size = Param.Int(0, "") - on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary + coalesce_engine = Param.CoalesceEngine(NULL, "The CoalesceEngine " + "this WLEngine is connected to.") + update_queue_size = Param.Int("Size of the queue WLEngine stores " + "the incoming updates") + register_file_size = Param.Int("Number of internal registers the " + "WLEngine has. 
It can service as " + "many updates as this queueu has " + "entries at the same time.") # 4 is arbitrary diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index efbfa5312d..5653ede698 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -43,7 +43,21 @@ namespace gem5 class BaseMemoryEngine : public ClockedObject { - private: + protected: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _pending(false) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + }; + class MemPort : public RequestPort { private: @@ -65,13 +79,11 @@ class BaseMemoryEngine : public ClockedObject virtual void recvReqRetry(); }; - protected: System* system; const RequestorID _requestorId; - AddrRange peerMemoryRange; MemPort memPort; - + AddrRange peerMemoryRange; size_t peerMemoryAtomSize; virtual void recvMemRetry() = 0; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 28b204e198..b8cac15f5c 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,20 +47,6 @@ class WLEngine; class CoalesceEngine : public BaseMemoryEngine { private: - class MemoryEvent : public EventFunctionWrapper - { - private: - bool _pending; - public: - MemoryEvent(const std::function &callback, - const std::string &name): - EventFunctionWrapper(callback, name), _pending(false) - {} - bool pending() { return _pending; } - void sleep() { _pending = true; } - void wake() { _pending = false; } - }; - struct Block { WorkListItem* items; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cfebf8e5df..d87462d7dd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ 
b/src/accl/graph/sega/push_engine.cc @@ -35,13 +35,15 @@ namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms): - BaseMemEngine(params), +PushEngine::PushEngine(const Params ¶ms): + BaseMemoryEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), numTotalRetries(0), numPendingRetries(0), - nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), + onTheFlyMemReqs(0), + memRespQueueSize(params.resp_queue_size), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) @@ -52,10 +54,8 @@ PushEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "req_port") { return reqPort; - } else if (if_name == "mem_port") { - return BaseMemEngine::getPort(if_name, idx); } else { - return SimObject::getPort(if_name, idx); + return BaseMemoryEngine::getPort(if_name, idx); } } @@ -98,9 +98,9 @@ PushEngine::ReqPort::recvReqRetry() if (!_blocked) { blockedPacket = nullptr; DPRINTF(PushEngine, "%s: Sent the blockedPacket. " - "_blocked: %s, (blockedPacket == nullptr): %s.\n", - __func__, _blocked ? "true" : "false", - (blockedPacket == nullptr) ? "true" : "false"); + "_blocked: %s, (blockedPacket == nullptr): %s.\n", + __func__, _blocked ? "true" : "false", + (blockedPacket == nullptr) ? 
"true" : "false"); } } @@ -149,14 +149,9 @@ PushEngine::recvWLItem(WorkListItem wl) DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); - if ((!nextAddrGenEvent.scheduled())) { - if (memQueueFull()) { - if (!pendingMemRetry()) { - requestMemRetry(1); - } - } else { - schedule(nextAddrGenEvent, nextCycle()); - } + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -178,67 +173,68 @@ PushEngine::recvWLItemRetry(WorkListItem wl) __func__, pushReqQueue.size()); numTotalRetries--; - if ((!nextAddrGenEvent.scheduled())) { - if (memQueueFull()) { - if (!pendingMemRetry()) { - requestMemRetry(1); - } - } else { - schedule(nextAddrGenEvent, nextCycle()); - } + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } } void -PushEngine::processNextAddrGenEvent() +PushEngine::processNextMemoryReadEvent() { - Addr aligned_addr, offset; - int num_edges; - - PushPacketInfoGen &curr_info = pushReqQueue.front(); - std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - DPRINTF(PushEngine, "%s: Current packet information generated by " - "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " - "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); - - PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - reqOffsetMap[pkt->req] = offset; - reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = curr_info.value(); - - enqueueMemReq(pkt); - - if (curr_info.done()) { - DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); - pushReqQueue.pop_front(); - DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" - "pushReqQueue.size() = %u.\n", - __func__, pushReqQueue.size()); - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - " free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } - } - } + if (memPort.blocked()) { + nextMemoryReadEvent.sleep(); + return; } - if (memQueueFull()) { - if (!pushReqQueue.empty()) { - requestMemRetry(1); + if (memRespQueue.size() < (memRespQueueSize - onTheFlyMemReqs)) { + Addr aligned_addr, offset; + int num_edges; + + PushPacketInfoGen &curr_info = pushReqQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + DPRINTF(PushEngine, "%s: Current packet information generated by " + "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); + reqOffsetMap[pkt->req] = offset; + reqNumEdgeMap[pkt->req] = num_edges; + reqValueMap[pkt->req] = curr_info.value(); + + memPort.sendPacket(pkt); + onTheFlyMemReqs++; + + if (curr_info.done()) { + DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); + pushReqQueue.pop_front(); + DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" + "pushReqQueue.size() = %u.\n", + __func__, pushReqQueue.size()); + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" + " free spaces.\n", __func__, free_space); + if ((free_space >= numElementsPerLine) && + (numPendingRetries == 0)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } + } + } } - return; } - if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { - schedule(nextAddrGenEvent, nextCycle()); + // if ((!nextMemoryReadEvent.scheduled()) && (!pushReqQueue.empty())) { + // schedule(nextMemoryReadEvent, nextCycle()); + // } + if (!pushReqQueue.empty()) { + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -253,9 +249,11 @@ PushEngine::processNextSendRetryEvent() void PushEngine::recvMemRetry() { - assert(!nextAddrGenEvent.scheduled()); - DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); - schedule(nextAddrGenEvent, nextCycle()); + if (nextMemoryReadEvent.pending()) { + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); + nextMemoryReadEvent.wake(); + schedule(nextMemoryReadEvent, nextCycle()); + } } bool @@ -264,7 +262,8 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. 
assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); - assert(memRespQueue.size() <= respQueueSize); + onTheFlyMemReqs--; + assert(memRespQueue.size() <= memRespQueueSize); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 378cd1a487..9b182e2251 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,7 +29,7 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_mem_engine.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -39,7 +39,7 @@ namespace gem5 class CoalesceEngine; -class PushEngine : public BaseMemEngine +class PushEngine : public BaseMemoryEngine { private: class PushPacketInfoGen { @@ -115,15 +115,14 @@ class PushEngine : public BaseMemEngine std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - // Since the push engine can process incoming packets faster than - // memory can send those packets, the size of this queue will - // always be limited by the b/w of the memory. 
+ int onTheFlyMemReqs; + int memRespQueueSize; std::deque memRespQueue; template PacketPtr createUpdatePacket(Addr addr, T value); - EventFunctionWrapper nextAddrGenEvent; - void processNextAddrGenEvent(); + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); EventFunctionWrapper nextPushEvent; void processNextPushEvent(); @@ -145,13 +144,12 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: - virtual int respBuffSize() { return memRespQueue.size(); } virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(PushEngine); - PushEngine(const PushEngineParams ¶ms); + PushEngine(const Params ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -169,7 +167,6 @@ class PushEngine : public BaseMemEngine int getNumRetries() { return numTotalRetries; } - void recvRetryReject() { numPendingRetries--; } }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 2d4ffc9cac..12f4548aa2 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -40,7 +40,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".resp_port", this), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), - registerFileSize(params.on_the_fly_update_map_size), + registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) From b7e76bfdb113a55311db67e0532495e958b4794b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 26 Jul 2022 09:01:42 -0700 Subject: [PATCH 133/247] Refactored PushEngine to inherit from BaseMemoryEngine. 
--- src/accl/graph/SConscript | 4 +- src/accl/graph/base/BaseMemEngine.py | 47 --- src/accl/graph/base/SConscript | 3 - src/accl/graph/base/base_mem_engine.cc | 225 -------------- src/accl/graph/base/base_mem_engine.hh | 125 -------- src/accl/graph/sega/base_memory_engine.cc | 4 + src/accl/graph/sega/base_memory_engine.hh | 7 +- src/accl/graph/sega/coalesce_engine.cc | 362 +++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 9 +- 9 files changed, 275 insertions(+), 511 deletions(-) delete mode 100644 src/accl/graph/base/BaseMemEngine.py delete mode 100644 src/accl/graph/base/base_mem_engine.cc delete mode 100644 src/accl/graph/base/base_mem_engine.hh diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 53c6411de6..5dffd1a396 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,5 +28,5 @@ Import('*') DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', - 'BaseMemEngine', 'BaseMemoryEngine']) +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', + 'WLEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/base/BaseMemEngine.py b/src/accl/graph/base/BaseMemEngine.py deleted file mode 100644 index 2ecb6659d8..0000000000 --- a/src/accl/graph/base/BaseMemEngine.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject - -class BaseMemEngine(ClockedObject): - abstract = True - type = 'BaseMemEngine' - cxx_header = "accl/graph/base/base_mem_engine.hh" - cxx_class = 'gem5::BaseMemEngine' - - system = Param.System(Parent.any, 'System this Engine is a part of') - mem_port = RequestPort("Port to communicate with the memory") - - outstanding_mem_req_queue_size = Param.Int(16, "Capacity of queue in " - "which memory requests are queued.") - - attached_memory_atom_size = Param.Int(64, "The atom size of the attached " - "memory.") - - resp_queue_size = Param.Int(64, "blah") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 45877a12ca..0e43d1aed8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,6 @@ Import('*') -SimObject('BaseMemEngine.py') SimObject('BaseReduceEngine.py') -Source('base_mem_engine.cc') Source('base_reduce_engine.cc') -DebugFlag('BaseMemEngine') diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc deleted file mode 100644 index 590307b2bc..0000000000 --- a/src/accl/graph/base/base_mem_engine.cc +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_mem_engine.hh" - -#include "debug/BaseMemEngine.hh" -#include "debug/SEGAStructureSize.hh" - -namespace gem5 -{ - -BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): - ClockedObject(params), - system(params.system), - memPort(name() + ".mem_port", this), - memQueueSize(params.outstanding_mem_req_queue_size), - onTheFlyReqs(0), - memRetryRequested(false), - memSpaceRequested(0), - nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), - respQueueSize(params.resp_queue_size), - _requestorId(system->getRequestorId(this)), - peerMemoryAtomSize(params.attached_memory_atom_size) -{} - -BaseMemEngine::~BaseMemEngine() -{} - -Port& -BaseMemEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "mem_port") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -BaseMemEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - return owner->recvTimingResp(pkt); -} - -void -BaseMemEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } - - owner->wakeUp(); -} - -void -BaseMemEngine::processNextMemReqEvent() -{ - if ((respQueueSize == 0) || - ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { - PacketPtr pkt = memQueue.front(); - memPort.sendPacket(pkt); - onTheFlyReqs++; - DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following info. 
" - "pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - memQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped pkt: %s from " - "memQueue. memQueue.size = %d, memQueueSize = %d.\n", - __func__, pkt->print(), memQueue.size(), memQueueSize); - DPRINTF(BaseMemEngine, "%s: Popped pkt: %s from " - "memQueue. memQueue.size = %d, memQueueSize = %d.\n", - __func__, pkt->print(), memQueue.size(), memQueueSize); - if (memRetryRequested && - (memQueue.size() <= - (memQueueSize - memSpaceRequested))) { - memRetryRequested = false; - memSpaceRequested = 0; - recvMemRetry(); - } - } - - if ((!memPort.blocked()) && - (!memQueue.empty()) && (!nextMemReqEvent.scheduled())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -PacketPtr -BaseMemEngine::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; -} - -PacketPtr -BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -bool -BaseMemEngine::allocateMemQueueSpace(int space) -{ - assert((memQueueSize == 0) || - (memQueue.size() <= memQueueSize)); - return ( - (memQueueSize == 0) || - (memQueue.size() <= (memQueueSize - space)) - ); -} - -bool -BaseMemEngine::memQueueFull() -{ - assert((memQueueSize == 0) || - (memQueue.size() <= memQueueSize)); - return ( - (memQueueSize != 0) && - (memQueue.size() == memQueueSize)); -} - -void 
-BaseMemEngine::enqueueMemReq(PacketPtr pkt) -{ - panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); - memQueue.push_back(pkt); - DPRINTF(SEGAStructureSize, "%s: Pushed pkt: %s to memQueue. " - "memQueue.size = %d, memQueueSize = %d.\n", __func__, - pkt->print(), memQueue.size(), memQueueSize); - DPRINTF(BaseMemEngine, "%s: Pushed pkt: %s to memQueue. " - "memQueue.size = %d, memQueueSize = %d.\n", __func__, - pkt->print(), memQueue.size(), memQueueSize); - if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -void -BaseMemEngine::requestMemRetry(int space) { - panic_if((memRetryRequested == true) || (memSpaceRequested != 0), - "You should not request another alarm without the first one being" - "responded to.\n"); - DPRINTF(BaseMemEngine, "%s: Alarm requested with space = %d.\n", __func__, space); - memRetryRequested = true; - memSpaceRequested = space; -} - -void -BaseMemEngine::wakeUp() -{ - assert(!nextMemReqEvent.scheduled()); - if (!memQueue.empty()) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -bool -BaseMemEngine::recvTimingResp(PacketPtr pkt) -{ - onTheFlyReqs--; - return handleMemResp(pkt); -} - -} diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh deleted file mode 100644 index 01c862d555..0000000000 --- a/src/accl/graph/base/base_mem_engine.hh +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ - -#include - -#include "base/addr_range.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/BaseMemEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" - -namespace gem5 -{ - -class BaseMemEngine : public ClockedObject -{ - private: - class MemPort : public RequestPort - { - private: - BaseMemEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseMemEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - System* system; - MemPort memPort; - - int memQueueSize; - int onTheFlyReqs; - bool memRetryRequested; - int memSpaceRequested; - std::deque memQueue; - - EventFunctionWrapper nextMemReqEvent; - void processNextMemReqEvent(); - - protected: - - int respQueueSize; - const RequestorID _requestorId; - - size_t peerMemoryAtomSize; - - bool allocateMemQueueSpace(int space); - bool memQueueFull(); - - bool pendingMemRetry() { return memRetryRequested; } - void requestMemRetry(int space); - - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - void enqueueMemReq(PacketPtr pkt); - - virtual int respBuffSize() = 0; - virtual void recvMemRetry() = 0; - virtual bool handleMemResp(PacketPtr pkt) = 0; - - PacketPtr createReadPacket(Addr addr, unsigned int size); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); - - public: - PARAMS(BaseMemEngine); - - BaseMemEngine(const BaseMemEngineParams ¶ms); - ~BaseMemEngine(); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - RequestorID requestorId() { return _requestorId; } - - AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); 
} - - bool recvTimingResp(PacketPtr pkt); - void recvFunctional(PacketPtr pkt); - - void wakeUp(); - -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index 9db95d6bd6..c60d189e0f 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -77,7 +77,11 @@ BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { blockedPacket = pkt; _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked. blockedPacket %s.\n", + __func__, blockedPacket->print()); } else { + DPRINTF(BaseMemoryEngine, "%s: Packet %s sent successfully.\n", + __func__, pkt->print()); owner->recvMemRetry(); } } diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index 5653ede698..f336edcbf1 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -48,14 +48,19 @@ class BaseMemoryEngine : public ClockedObject { private: bool _pending; + int _prevState; + public: MemoryEvent(const std::function &callback, const std::string &name): - EventFunctionWrapper(callback, name), _pending(false) + EventFunctionWrapper(callback, name), + _pending(false), _prevState(0) {} bool pending() { return _pending; } void sleep() { _pending = true; } void wake() { _pending = false; } + void setPrevState(int state) { _prevState = state; } + int getPrevState() { return _prevState; } }; class MemPort : public RequestPort diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 21f048213a..daaed28f1c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/coalesce_engine.hh" +#include + #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" @@ -53,7 +55,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): 
nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), + nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -317,6 +319,10 @@ CoalesceEngine::processNextMemoryReadEvent() pendingEventQueue.push_back("nextMemoryReadEvent"); // Maximum three MemoryEvents. assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " + "has been pushed to pendingEventQueue. " + "pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); return; } @@ -366,11 +372,14 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::recvMemRetry() { + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); if (pendingEventQueue.empty()) { + DPRINTF(CoalesceEngine, "%s: No events pending.\n", __func__); return; } std::string front = pendingEventQueue.front(); + DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); if (front == "nextMemoryReadEvent") { assert(!nextMemoryReadEvent.scheduled()); @@ -382,11 +391,11 @@ CoalesceEngine::recvMemRetry() assert(nextWriteBackEvent.pending()); schedule(nextWriteBackEvent, nextCycle()); nextWriteBackEvent.wake(); - } else if (front == "nextSendRetryEvent") { - assert(!nextSendRetryEvent.scheduled()); - assert(nextSendRetryEvent.pending()); - schedule(nextSendRetryEvent, nextCycle()); - nextSendRetryEvent.wake(); + } else if (front == "nextRecvPushRetryEvent") { + assert(!nextRecvPushRetryEvent.scheduled()); + assert(nextRecvPushRetryEvent.pending()); + schedule(nextRecvPushRetryEvent, nextCycle()); + nextRecvPushRetryEvent.wake(); } else { panic("EVENT IS NOT RECOGNIZED.\n"); } @@ -642,14 +651,16 @@ CoalesceEngine::processNextApplyEvent() int 
block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask != 0) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid apply process. " - "Therefore, ignoring the apply schedule.\n", + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " + "apply process. Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. Therefore, no apply " - "needed.\n", __func__, block_index); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. " + "Therefore, no apply needed.\n", __func__, block_index); } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] could be applied.\n", + __func__, block_index); for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; uint32_t new_prop = std::min( @@ -683,8 +694,9 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].hasConflict){ writeBackQueue.push_back(block_index); assert(writeBackQueue.size() <= numLines); - DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. writeBackQueue.size = %u.\n", - __func__, block_index, writeBackQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " + "writeBackQueue.size = %u.\n", __func__, + block_index, writeBackQueue.size()); } applyQueue.pop_front(); @@ -710,6 +722,10 @@ CoalesceEngine::processNextWriteBackEvent() pendingEventQueue.push_back("nextWriteBackEvent"); // Maximum three MemoryEvent. assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " + "has been pushed to pendingEventQueue. 
" + "pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); return; } @@ -774,121 +790,259 @@ void CoalesceEngine::recvPushRetry() { numRetriesReceived++; + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); // For now since we do only one retry at a time, we should not receive // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(!nextSendRetryEvent.pending()); - assert(!nextSendRetryEvent.scheduled()); + assert(!nextRecvPushRetryEvent.pending()); + assert(!nextRecvPushRetryEvent.scheduled()); assert(numRetriesReceived == 1); - schedule(nextSendRetryEvent, nextCycle()); + schedule(nextRecvPushRetryEvent, nextCycle()); } -void -CoalesceEngine::processNextSendRetryEvent() +// void +// CoalesceEngine::processNextRecvPushRetryEvent() +// { +// assert(!nextRecvPushRetryEvent.pending()); +// assert(needsPush.count() != 0); + +// Addr block_addr = 0; +// int block_index = 0; +// int it = 0; +// uint32_t slice = 0; +// bool hit_in_cache = false; + +// for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; +// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { +// for (int i = 0; i < numElementsPerLine; i++) { +// slice <<= 1; +// slice |= needsPush[it + i]; +// } +// if (slice) { +// block_addr = getBlockAddrFromBitIndex(it); +// block_index = getBlockIndex(block_addr); +// if ((cacheBlocks[block_index].addr == block_addr) && +// (cacheBlocks[block_index].valid)) { +// if (cacheBlocks[block_index].busyMask == 0) { +// hit_in_cache = true; +// break; +// } +// } else { +// hit_in_cache = false; +// break; +// } +// } +// } + +// assert(it < MAX_BITVECTOR_SIZE); +// if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { +// currentBitSliceIndex = 0; +// } else { +// currentBitSliceIndex = it + numElementsPerLine; +// } + +// DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " +// "in needsPush.\n", __func__, slice, it); + +// if (hit_in_cache) { +// int push_needed = 0; +// 
DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// assert(peerPushEngine->getNumRetries() == needsPush.count()); +// for (int i = 0; i < numElementsPerLine; i++) { +// // TODO: Make this more programmable +// uint32_t new_prop = std::min( +// cacheBlocks[block_index].items[i].prop, +// cacheBlocks[block_index].items[i].tempProp); +// cacheBlocks[block_index].items[i].tempProp = new_prop; +// cacheBlocks[block_index].items[i].prop = new_prop; +// if (needsPush[it + i] == 1) { +// peerPushEngine->recvWLItemRetry( +// cacheBlocks[block_index].items[i]); +// } +// push_needed += needsPush[it + i]; +// needsPush[it + i] = 0; +// } +// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); +// assert(peerPushEngine->getNumRetries() == needsPush.count()); +// if (applyQueue.find(block_index)) { +// applyQueue.erase(block_index); +// if (applyQueue.empty() && nextApplyEvent.scheduled()) { +// deschedule(nextApplyEvent); +// } +// if (cacheBlocks[block_index].hasConflict) { +// writeBackQueue.push_back(block_index); +// assert(writeBackQueue.size() <= numLines); +// if ((!writeBackQueue.empty()) && +// (!nextWriteBackEvent.pending()) && +// (!nextWriteBackEvent.scheduled())) { +// schedule(nextWriteBackEvent, nextCycle()); +// } +// } +// } +// } else { +// if (memPort.blocked()) { +// nextRecvPushRetryEvent.sleep(); +// pendingEventQueue.push_back("nextRecvPushRetryEvent"); +// // Maximum three MemoryEvent. +// assert(pendingEventQueue.size() <= 3); +// return; +// } + +// // FIXME: Fix the retry mechanism between memory and cache to +// // handle memory retries correctly. This probably requires scheduling +// // an event for sending the retry. For now we're enabling infinite +// // queueing in the memQueue. +// // FIXME: Also do not send requests for cache lines that are already +// // read but await data. 
Just set a flag or sth. +// PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); +// SenderState* sender_state = new SenderState(true); +// pkt->pushSenderState(sender_state); +// memPort.sendPacket(pkt); +// } + +// numRetriesReceived--; +// assert(numRetriesReceived == 0); +// assert(!nextRecvPushRetryEvent.scheduled()); +// } + +std::tuple +CoalesceEngine::getOptimalBitVectorSlice() { - assert(!nextSendRetryEvent.pending()); - assert(needsPush.count() != 0); + bool hit_in_cache; + int slice_base = -1; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - Addr block_addr = 0; - int block_index = 0; - int it = 0; - uint32_t slice = 0; - bool hit_in_cache = false; - - for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; - it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { + int score = 0; + uint32_t current_popcount = 0; + for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + int current_score = 0; for (int i = 0; i < numElementsPerLine; i++) { - slice <<= 1; - slice |= needsPush[it + i]; + current_popcount += needsPush[it + i]; } - if (slice) { - block_addr = getBlockAddrFromBitIndex(it); - block_index = getBlockIndex(block_addr); - if ((cacheBlocks[block_index].addr == block_addr) && - (cacheBlocks[block_index].valid)) { - if (cacheBlocks[block_index].busyMask == 0) { - hit_in_cache = true; - break; - } - } else { + if (current_popcount == 0) { + continue; + } + current_score += current_popcount; + Addr addr = getBlockAddrFromBitIndex(it); + int block_index = getBlockIndex(addr); + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].busyMask == 0)) { + current_score += numElementsPerLine * 2; + if (current_score > score) { + score = current_score; + slice_base = it; + hit_in_cache = true; + } + } else if (!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].allocated))) { + score += numElementsPerLine; + if (current_score > 
score) { + score = current_score; + slice_base = it; hit_in_cache = false; - break; } } } - assert(it < MAX_BITVECTOR_SIZE); - if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { - currentBitSliceIndex = 0; - } else { - currentBitSliceIndex = it + numElementsPerLine; - } + return std::make_tuple(hit_in_cache, slice_base); +} + +void +CoalesceEngine::processNextRecvPushRetryEvent() +{ + bool hit_in_cache; + int slice_base; + std::tie(hit_in_cache, slice_base)= getOptimalBitVectorSlice(); - DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " - "in needsPush.\n", __func__, slice, it); + if (slice_base != -1) { + Addr addr = getBlockAddrFromBitIndex(slice_base); + int block_index = getBlockIndex(addr); + if (hit_in_cache) { + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + + // if nextRecvPushRetryEvent has been blocked by memory before + if (nextRecvPushRetryEvent.getPrevState() == -1) { + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " + "its MemRetry.\n", __func__); + recvMemRetry(); + nextRecvPushRetryEvent.setPrevState(0); + } - if (hit_in_cache) { - int push_needed = 0; - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - // TODO: Make this more programmable - uint32_t new_prop = std::min( + int push_needed = 0; + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + + for (int i = 0; i < numElementsPerLine; i++) { + // TODO: Make this more programmable + uint32_t new_prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry( - 
cacheBlocks[block_index].items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; - } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + if (needsPush[slice_base + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[slice_base + i]; + needsPush[slice_base + i] = 0; } - if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!writeBackQueue.empty()) && - (!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } + if (cacheBlocks[block_index].hasConflict) { + writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); + if ((!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); + } } } - } - } else { - if (memPort.blocked()) { - nextSendRetryEvent.sleep(); - pendingEventQueue.push_back("nextSendRetryEvent"); - // Maximum three MemoryEvent. 
- assert(pendingEventQueue.size() <= 3); - return; - } + } else { + if (memPort.blocked()) { + assert(nextRecvPushRetryEvent.getPrevState() != -1); + nextRecvPushRetryEvent.setPrevState(-1); + nextRecvPushRetryEvent.sleep(); + pendingEventQueue.push_back("nextRecvPushRetryEvent"); + assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " + "and has been pushed to pendingEventQueue." + " pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); + return; + } + // if nextRecvPushRetryEvent has been blocked by memory before + if (nextRecvPushRetryEvent.getPrevState() == -1) { + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " + "unblocked by memPort. Setting prevState to 0.\n", __func__); + nextRecvPushRetryEvent.setPrevState(0); + } - // FIXME: Fix the retry mechanism between memory and cache to - // handle memory retries correctly. This probably requires scheduling - // an event for sending the retry. For now we're enabling infinite - // queueing in the memQueue. - // FIXME: Also do not send requests for cache lines that are already - // read but await data. Just set a flag or sth. - PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + // TODO: Set a tracking structure so that nextMemoryReadEvent knows + // It does not have to read this address anymore. It can simply set + // a flag to true (maybe not even needed just look if the cache has a + // line allocated for it in the cacheBlocks). 
+ } + numRetriesReceived--; + assert(numRetriesReceived == 0); + } + if (numRetriesReceived > 0) { + schedule(nextRecvPushRetryEvent, nextCycle()); } - - numRetriesReceived--; - assert(numRetriesReceived == 0); - assert(!nextSendRetryEvent.scheduled()); } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b8cac15f5c..356fee0107 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,6 +106,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + std::tuple getOptimalBitVectorSlice(); std::deque pendingEventQueue; @@ -121,8 +122,8 @@ class CoalesceEngine : public BaseMemoryEngine MemoryEvent nextWriteBackEvent; void processNextWriteBackEvent(); - MemoryEvent nextSendRetryEvent; - void processNextSendRetryEvent(); + MemoryEvent nextRecvPushRetryEvent; + void processNextRecvPushRetryEvent(); struct CoalesceStats : public statistics::Group { @@ -145,8 +146,8 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceStats stats; protected: - virtual void recvMemRetry(); - virtual bool handleMemResp(PacketPtr pkt); + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; public: PARAMS(CoalesceEngine); From 0fc5c5efb512183db2b35cc30217555073973296 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 26 Jul 2022 09:49:11 -0700 Subject: [PATCH 134/247] Making bit vector smaller and choosing slices faster. 
--- src/accl/graph/sega/coalesce_engine.cc | 7 ++++++- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index daaed28f1c..f86d6877ad 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -914,9 +914,10 @@ CoalesceEngine::getOptimalBitVectorSlice() int slice_base = -1; int score = 0; - uint32_t current_popcount = 0; + int max_score_possible = 3 * numElementsPerLine; for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { int current_score = 0; + uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; } @@ -934,6 +935,9 @@ CoalesceEngine::getOptimalBitVectorSlice() score = current_score; slice_base = it; hit_in_cache = true; + if (score == max_score_possible) { + break; + } } } else if (!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].allocated))) { @@ -942,6 +946,7 @@ CoalesceEngine::getOptimalBitVectorSlice() score = current_score; slice_base = it; hit_in_cache = false; + assert(score < max_score_possible); } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 356fee0107..f6ed4843fa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -37,7 +37,7 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" -#define MAX_BITVECTOR_SIZE (1 << 30) +#define MAX_BITVECTOR_SIZE (1 << 28) namespace gem5 { From ef61dcfccf1e22ea364b6ce13437c9ea9676fceb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 28 Jul 2022 06:36:15 -0700 Subject: [PATCH 135/247] Merging all memory interactions into one event. 
--- src/accl/graph/sega/coalesce_engine.cc | 559 +++++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 24 +- 2 files changed, 255 insertions(+), 328 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f86d6877ad..4d7107274b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,17 +45,15 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - currentBitSliceIndex(0), - numRetriesReceived(0), - applyQueue(numLines), - writeBackQueue(numLines), - nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + numRetriesReceived(0), applyQueue(numLines), + // writeBackQueue(numLines), + nextMemoryEvent([this] { processNextMemoryEvent(); }, name()), + // nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), - nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), + // nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), + // nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -69,49 +67,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): needsPush.reset(); } -// void -// CoalesceEngine::startup() -// { -// return; - // std::cout << "Hello" << std::endl; - // DPRINTF(CoalesceEngine, "%s: Range attached to this engine is %s.\n", - // __func__, 
peerMemoryRange.to_string()); - // AddrRangeList vertex_ranges = getAddrRanges(); - - // bool found = false; - // Addr first_match_addr = 0; - // while(true) { - // for (auto range: vertex_ranges) { - // if (range.contains(first_match_addr)) { - // found = true; - // break; - // } - // } - // if (found) { - // break; - // } - // first_match_addr += peerMemoryAtomSize; - // } - - // found = false; - // Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - // while(true) { - // for (auto range: vertex_ranges) { - // if (range.contains(second_match_addr)) { - // found = true; - // break; - // } - // } - // if (found) { - // break; - // } - // second_match_addr += peerMemoryAtomSize; - // } - - // nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); - // memoryAddressOffset = first_match_addr; -// } - void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) { @@ -260,15 +215,20 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - fillQueue.push_back(block_index); - assert(fillQueue.size() <= numLines); + // fillQueue.push_back(block_index); + // assert(fillQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); // FIXME: Fix this DPRINTF // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", // __func__, fillQueue.size()); - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())) { - schedule(nextMemoryReadEvent, nextCycle()); + // if ((!nextMemoryReadEvent.pending()) && + // (!nextMemoryReadEvent.scheduled())) { + // schedule(nextMemoryReadEvent, nextCycle()); + // } + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } stats.readMisses++; stats.numVertexReads++; @@ -309,24 +269,24 @@ CoalesceEngine::recvWLRead(Addr addr) } void -CoalesceEngine::processNextMemoryReadEvent() +CoalesceEngine::processNextMemoryReadEvent(int block_index) { - assert(!nextMemoryReadEvent.pending()); - if (memPort.blocked()) { - // TODO: Implement interface where events of the CoalesceEngine are - // pushed to a fifo to be scheduled later. - nextMemoryReadEvent.sleep(); - pendingEventQueue.push_back("nextMemoryReadEvent"); - // Maximum three MemoryEvents. - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " - "has been pushed to pendingEventQueue. " - "pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // assert(!nextMemoryReadEvent.pending()); + // if (memPort.blocked()) { + // // TODO: Implement interface where events of the CoalesceEngine are + // // pushed to a fifo to be scheduled later. + // nextMemoryReadEvent.sleep(); + // pendingEventQueue.push_back("nextMemoryReadEvent"); + // // Maximum three MemoryEvents. + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " + // "has been pushed to pendingEventQueue. 
" + // "pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } - int block_index = fillQueue.front(); + // int block_index = fillQueue.front(); PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " @@ -334,13 +294,11 @@ CoalesceEngine::processNextMemoryReadEvent() memPort.sendPacket(pkt); - fillQueue.pop_front(); + // fillQueue.pop_front(); - if (!fillQueue.empty()) { - assert(!nextMemoryReadEvent.scheduled()); - assert(!nextMemoryReadEvent.pending()); - schedule(nextMemoryReadEvent, nextCycle()); - } + // if (!fillQueue.empty()) { + // memoryFunctionQueue.push_back([this] { processNextMemoryReadEvent(); }); + // } } // TODO: For loop to empty the entire responseQueue. @@ -370,38 +328,70 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::recvMemRetry() +CoalesceEngine::processNextMemoryEvent() { - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - if (pendingEventQueue.empty()) { - DPRINTF(CoalesceEngine, "%s: No events pending.\n", __func__); + if (memPort.blocked()) { + nextMemoryEvent.sleep(); return; } - std::string front = pendingEventQueue.front(); - DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); - - if (front == "nextMemoryReadEvent") { - assert(!nextMemoryReadEvent.scheduled()); - assert(nextMemoryReadEvent.pending()); - schedule(nextMemoryReadEvent, nextCycle()); - nextMemoryReadEvent.wake(); - } else if (front == "nextWriteBackEvent") { - assert(!nextWriteBackEvent.scheduled()); - assert(nextWriteBackEvent.pending()); - schedule(nextWriteBackEvent, nextCycle()); - nextWriteBackEvent.wake(); - } else if (front == "nextRecvPushRetryEvent") { - assert(!nextRecvPushRetryEvent.scheduled()); - assert(nextRecvPushRetryEvent.pending()); - schedule(nextRecvPushRetryEvent, nextCycle()); - nextRecvPushRetryEvent.wake(); - } else { - panic("EVENT IS NOT RECOGNIZED.\n"); + 
std::function next_memory_function; + int next_memory_function_input; + std::tie(next_memory_function, next_memory_function_input) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input); + memoryFunctionQueue.pop_front(); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); } +} - pendingEventQueue.pop_front(); - return; +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + // if (pendingEventQueue.empty()) { + // DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + // return; + // } + + // std::string front = pendingEventQueue.front(); + // DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); + + // if (front == "nextMemoryReadEvent") { + // assert(!nextMemoryReadEvent.scheduled()); + // assert(nextMemoryReadEvent.pending()); + // schedule(nextMemoryReadEvent, nextCycle()); + // nextMemoryReadEvent.wake(); + // } else if (front == "nextWriteBackEvent") { + // assert(!nextWriteBackEvent.scheduled()); + // assert(nextWriteBackEvent.pending()); + // schedule(nextWriteBackEvent, nextCycle()); + // nextWriteBackEvent.wake(); + // } else if (front == "nextRecvPushRetryEvent") { + // assert(!nextRecvPushRetryEvent.scheduled()); + // assert(nextRecvPushRetryEvent.pending()); + // schedule(nextRecvPushRetryEvent, nextCycle()); + // nextRecvPushRetryEvent.wake(); + // } else { + // panic("EVENT IS NOT RECOGNIZED.\n"); + // } + + // pendingEventQueue.pop_front(); + // return; + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, 
nextCycle()); } // FIXME: Fix this function. @@ -464,12 +454,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // if ((!nextWriteBackEvent.pending()) && + // (!nextWriteBackEvent.scheduled())) { + // schedule(nextWriteBackEvent, nextCycle()); + // } + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } } else { @@ -528,9 +523,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); - assert((cacheBlocks[block_index].allocated) && // allocated cache block - (!cacheBlocks[block_index].valid) && // valid is false - (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR + // assert((cacheBlocks[block_index].allocated) && // allocated cache block + // (!cacheBlocks[block_index].valid) && // valid is false + // (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR + assert(cacheBlocks[block_index].allocated); + assert(!cacheBlocks[block_index].valid); + assert(MSHR.find(block_index) != MSHR.end()); pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -691,22 +689,21 @@ CoalesceEngine::processNextApplyEvent() } // TODO: This is where eviction policy goes - if (cacheBlocks[block_index].hasConflict){ - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. 
" - "writeBackQueue.size = %u.\n", __func__, - block_index, writeBackQueue.size()); + if ((cacheBlocks[block_index].hasConflict) && + (cacheBlocks[block_index].busyMask == 0)) { + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " + // "writeBackQueue.size = %u.\n", __func__, + // block_index, writeBackQueue.size()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } applyQueue.pop_front(); - - if ((!writeBackQueue.empty()) && - (!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } - if ((!applyQueue.empty()) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); @@ -714,22 +711,22 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextWriteBackEvent() +CoalesceEngine::processNextWriteBackEvent(int block_index) { - assert(!nextWriteBackEvent.pending()); - if (memPort.blocked()) { - nextWriteBackEvent.sleep(); - pendingEventQueue.push_back("nextWriteBackEvent"); - // Maximum three MemoryEvent. - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " - "has been pushed to pendingEventQueue. " - "pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // assert(!nextWriteBackEvent.pending()); + // if (memPort.blocked()) { + // nextWriteBackEvent.sleep(); + // pendingEventQueue.push_back("nextWriteBackEvent"); + // // Maximum three MemoryEvent. + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " + // "has been pushed to pendingEventQueue. 
" + // "pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } - int block_index = writeBackQueue.front(); + // int block_index = writeBackQueue.front(); // Why would we write it back if it does not have a conflict? assert(cacheBlocks[block_index].hasConflict); @@ -769,21 +766,35 @@ CoalesceEngine::processNextWriteBackEvent() cacheBlocks[block_index].dirty = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - fillQueue.push_back(block_index); - assert(fillQueue.size() <= numLines); - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())){ - schedule(nextMemoryReadEvent, nextCycle()); - } + // fillQueue.push_back(block_index); + // assert(fillQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); + // if ((!nextMemoryReadEvent.pending()) && + // (!nextMemoryReadEvent.scheduled())){ + // schedule(nextMemoryReadEvent, nextCycle()); + // } + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } - writeBackQueue.pop_front(); - - if (!writeBackQueue.empty()) { - assert(!nextWriteBackEvent.pending()); - assert(!nextWriteBackEvent.scheduled()); - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.pop_front(); + // assert(writeBackQueue.size() <= numLines); + // DPRINTF(CoalesceEngine, "%s: Popped %d from writeBackQueue. 
" + // "writeBackQueue.size = %d, writeBackQueueSize = %d.\n", + // __func__, block_index, writeBackQueue.size(), numLines); + + // if (!writeBackQueue.empty()) { + // assert(!nextWriteBackEvent.pending()); + // assert(!nextWriteBackEvent.scheduled()); + // schedule(nextWriteBackEvent, nextCycle()); + // memoryFunctionQueue.push_back([this] { processNextWriteBackEvent(); }); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // } } void @@ -793,130 +804,28 @@ CoalesceEngine::recvPushRetry() DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); // For now since we do only one retry at a time, we should not receive // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(!nextRecvPushRetryEvent.pending()); - assert(!nextRecvPushRetryEvent.scheduled()); + // assert(!nextRecvPushRetryEvent.pending()); + // assert(!nextRecvPushRetryEvent.scheduled()); assert(numRetriesReceived == 1); - schedule(nextRecvPushRetryEvent, nextCycle()); + // schedule(nextRecvPushRetryEvent, nextCycle()); + // TODO: Pass slice_base to getOptimalBitVectorSlice + memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } -// void -// CoalesceEngine::processNextRecvPushRetryEvent() -// { -// assert(!nextRecvPushRetryEvent.pending()); -// assert(needsPush.count() != 0); - -// Addr block_addr = 0; -// int block_index = 0; -// int it = 0; -// uint32_t slice = 0; -// bool hit_in_cache = false; - -// for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; -// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { -// for (int i = 0; i < numElementsPerLine; i++) { -// slice <<= 1; -// slice |= needsPush[it + i]; -// } -// if (slice) { -// block_addr = getBlockAddrFromBitIndex(it); -// block_index = 
getBlockIndex(block_addr); -// if ((cacheBlocks[block_index].addr == block_addr) && -// (cacheBlocks[block_index].valid)) { -// if (cacheBlocks[block_index].busyMask == 0) { -// hit_in_cache = true; -// break; -// } -// } else { -// hit_in_cache = false; -// break; -// } -// } -// } - -// assert(it < MAX_BITVECTOR_SIZE); -// if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { -// currentBitSliceIndex = 0; -// } else { -// currentBitSliceIndex = it + numElementsPerLine; -// } - -// DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " -// "in needsPush.\n", __func__, slice, it); - -// if (hit_in_cache) { -// int push_needed = 0; -// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// assert(peerPushEngine->getNumRetries() == needsPush.count()); -// for (int i = 0; i < numElementsPerLine; i++) { -// // TODO: Make this more programmable -// uint32_t new_prop = std::min( -// cacheBlocks[block_index].items[i].prop, -// cacheBlocks[block_index].items[i].tempProp); -// cacheBlocks[block_index].items[i].tempProp = new_prop; -// cacheBlocks[block_index].items[i].prop = new_prop; -// if (needsPush[it + i] == 1) { -// peerPushEngine->recvWLItemRetry( -// cacheBlocks[block_index].items[i]); -// } -// push_needed += needsPush[it + i]; -// needsPush[it + i] = 0; -// } -// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); -// assert(peerPushEngine->getNumRetries() == needsPush.count()); -// if (applyQueue.find(block_index)) { -// applyQueue.erase(block_index); -// if (applyQueue.empty() && nextApplyEvent.scheduled()) { -// deschedule(nextApplyEvent); -// } -// if (cacheBlocks[block_index].hasConflict) { -// writeBackQueue.push_back(block_index); -// assert(writeBackQueue.size() <= numLines); -// if ((!writeBackQueue.empty()) && -// (!nextWriteBackEvent.pending()) && -// (!nextWriteBackEvent.scheduled())) { -// 
schedule(nextWriteBackEvent, nextCycle()); -// } -// } -// } -// } else { -// if (memPort.blocked()) { -// nextRecvPushRetryEvent.sleep(); -// pendingEventQueue.push_back("nextRecvPushRetryEvent"); -// // Maximum three MemoryEvent. -// assert(pendingEventQueue.size() <= 3); -// return; -// } - -// // FIXME: Fix the retry mechanism between memory and cache to -// // handle memory retries correctly. This probably requires scheduling -// // an event for sending the retry. For now we're enabling infinite -// // queueing in the memQueue. -// // FIXME: Also do not send requests for cache lines that are already -// // read but await data. Just set a flag or sth. -// PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); -// SenderState* sender_state = new SenderState(true); -// pkt->pushSenderState(sender_state); -// memPort.sendPacket(pkt); -// } - -// numRetriesReceived--; -// assert(numRetriesReceived == 0); -// assert(!nextRecvPushRetryEvent.scheduled()); -// } - std::tuple CoalesceEngine::getOptimalBitVectorSlice() { - bool hit_in_cache; + bool hit_in_cache = false; int slice_base = -1; - int score = 0; - int max_score_possible = 3 * numElementsPerLine; + // int score = 0; + // int max_score_possible = 3 * numElementsPerLine; for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - int current_score = 0; + // int current_score = 0; uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; @@ -924,30 +833,32 @@ CoalesceEngine::getOptimalBitVectorSlice() if (current_popcount == 0) { continue; } - current_score += current_popcount; + // current_score += current_popcount; Addr addr = getBlockAddrFromBitIndex(it); int block_index = getBlockIndex(addr); if ((cacheBlocks[block_index].valid) && (cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].busyMask == 0)) { - current_score += numElementsPerLine * 2; - if (current_score > score) { - score = current_score; - slice_base = 
it; - hit_in_cache = true; - if (score == max_score_possible) { - break; - } - } + // current_score += numElementsPerLine * 2; + // if (current_score > score) { + // score = current_score; + // slice_base = it; + // hit_in_cache = true; + // if (score == max_score_possible) { + // break; + // } + // } + return std::make_tuple(true, it); } else if (!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].allocated))) { - score += numElementsPerLine; - if (current_score > score) { - score = current_score; - slice_base = it; - hit_in_cache = false; - assert(score < max_score_possible); - } + // score += numElementsPerLine; + // if (current_score > score) { + // score = current_score; + // slice_base = it; + // hit_in_cache = false; + // assert(score < max_score_possible); + // } + return std::make_tuple(false, it); } } @@ -955,11 +866,11 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextRecvPushRetryEvent() +CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) { bool hit_in_cache; int slice_base; - std::tie(hit_in_cache, slice_base)= getOptimalBitVectorSlice(); + std::tie(hit_in_cache, slice_base) = getOptimalBitVectorSlice(); if (slice_base != -1) { Addr addr = getBlockAddrFromBitIndex(slice_base); @@ -969,12 +880,12 @@ CoalesceEngine::processNextRecvPushRetryEvent() assert(cacheBlocks[block_index].busyMask == 0); // if nextRecvPushRetryEvent has been blocked by memory before - if (nextRecvPushRetryEvent.getPrevState() == -1) { - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " - "its MemRetry.\n", __func__); - recvMemRetry(); - nextRecvPushRetryEvent.setPrevState(0); - } + // if (nextRecvPushRetryEvent.getPrevState() == -1) { + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " + // "its MemRetry.\n", __func__); + // recvMemRetry(); + // nextRecvPushRetryEvent.setPrevState(0); + // } int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", @@ -1005,33 +916,38 @@ 
CoalesceEngine::processNextRecvPushRetryEvent() deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + // if ((!nextWriteBackEvent.pending()) && + // (!nextWriteBackEvent.scheduled())) { + // schedule(nextWriteBackEvent, nextCycle()); + // } + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } } else { - if (memPort.blocked()) { - assert(nextRecvPushRetryEvent.getPrevState() != -1); - nextRecvPushRetryEvent.setPrevState(-1); - nextRecvPushRetryEvent.sleep(); - pendingEventQueue.push_back("nextRecvPushRetryEvent"); - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " - "and has been pushed to pendingEventQueue." - " pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // if (memPort.blocked()) { + // // assert(nextRecvPushRetryEvent.getPrevState() != -1); + // nextRecvPushRetryEvent.setPrevState(-1); + // nextRecvPushRetryEvent.sleep(); + // pendingEventQueue.push_back("nextRecvPushRetryEvent"); + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " + // "and has been pushed to pendingEventQueue." + // " pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } // if nextRecvPushRetryEvent has been blocked by memory before - if (nextRecvPushRetryEvent.getPrevState() == -1) { - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " - "unblocked by memPort. 
Setting prevState to 0.\n", __func__); - nextRecvPushRetryEvent.setPrevState(0); - } + // if (nextRecvPushRetryEvent.getPrevState() == -1) { + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " + // "unblocked by memPort. Setting prevState to 0.\n", __func__); + // nextRecvPushRetryEvent.setPrevState(0); + // } PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -1045,8 +961,15 @@ CoalesceEngine::processNextRecvPushRetryEvent() numRetriesReceived--; assert(numRetriesReceived == 0); } + // if (numRetriesReceived > 0) { + // schedule(nextRecvPushRetryEvent, nextCycle()); + // } if (numRetriesReceived > 0) { - schedule(nextRecvPushRetryEvent, nextCycle()); + memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f6ed4843fa..4036dc49af 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -92,26 +92,30 @@ class CoalesceEngine : public BaseMemoryEngine int numTgtsPerMSHR; std::unordered_map> MSHR; - std::deque fillQueue; + // std::deque fillQueue; std::deque> responseQueue; - int currentBitSliceIndex; int numRetriesReceived; InOutSet applyQueue; std::bitset needsPush; - InOutSet writeBackQueue; + // InOutSet writeBackQueue; + int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); - std::deque pendingEventQueue; + // std::deque pendingEventQueue; + + std::deque, int>> memoryFunctionQueue; + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); - MemoryEvent nextMemoryReadEvent; - void processNextMemoryReadEvent(); + // MemoryEvent nextMemoryReadEvent; + void 
processNextMemoryReadEvent(int block_index); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -119,11 +123,11 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - MemoryEvent nextWriteBackEvent; - void processNextWriteBackEvent(); + // MemoryEvent nextWriteBackEvent; + void processNextWriteBackEvent(int block_index); - MemoryEvent nextRecvPushRetryEvent; - void processNextRecvPushRetryEvent(); + // MemoryEvent nextRecvPushRetryEvent; + void processNextRecvPushRetryEvent(int slice_base); struct CoalesceStats : public statistics::Group { From d00c61008d8ee2157b711441cd71a34ab32bb108 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Jul 2022 10:59:33 -0700 Subject: [PATCH 136/247] Adding more dprintfs. --- src/accl/graph/base/data_structs.hh | 36 +- src/accl/graph/sega/base_memory_engine.cc | 8 +- src/accl/graph/sega/coalesce_engine.cc | 676 ++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 36 +- 4 files changed, 275 insertions(+), 481 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f178d5a7e2..707b57c56f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,9 +32,7 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include -#include -#include +#include namespace gem5 { @@ -90,49 +88,51 @@ static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); template -class InOutSet +class UniqueFIFO { private: - std::unordered_set set; + std::list fifo; public: - InOutSet(int cap) - { - set.reserve(cap); - } + UniqueFIFO() {} void push_back(T item) { - if (set.find(item) == set.end()) { - set.insert(item); + if (!find(item)) { + fifo.push_back(item); } } void pop_front() { - assert(set.begin() != set.end()); - set.erase(set.begin()); + assert(!fifo.empty()); + fifo.pop_front(); } T front() { - return *(set.begin()); + return 
fifo.front(); } size_t size() { - return set.size(); + return fifo.size(); } bool empty() { - return (size() == 0); + return fifo.empty(); } bool find(T item) { - return (set.find(item) != set.end()); + // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); + auto it = std::find(fifo.begin(), fifo.end(), item); + return (it != fifo.end()); } void erase(T item) { - set.erase(item); + // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); + auto it = std::find(fifo.begin(), fifo.end(), item); + assert(it != fifo.end()); + fifo.erase(it); } }; diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index c60d189e0f..a5d1d7e8e7 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -73,15 +73,15 @@ void BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; - DPRINTF(BaseMemoryEngine, "%s: MemPort blocked. 
blockedPacket %s.\n", - __func__, blockedPacket->print()); + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); } else { - DPRINTF(BaseMemoryEngine, "%s: Packet %s sent successfully.\n", - __func__, pkt->print()); + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); owner->recvMemRetry(); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4d7107274b..6ed94fe938 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,14 +46,16 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - numRetriesReceived(0), applyQueue(numLines), - // writeBackQueue(numLines), - nextMemoryEvent([this] { processNextMemoryEvent(); }, name()), - // nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), - nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyEvent([this] { processNextApplyEvent(); }, name()), - // nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - // nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), + numRetriesReceived(0), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -79,8 +81,6 @@ CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - DPRINTF(CoalesceEngine, "%s: Trimming addr: %lu to %lu.\n", - __func__, addr, trimmed_addr); return ((int) (trimmed_addr / 
peerMemoryAtomSize)) % numLines; } @@ -108,21 +108,25 @@ bool CoalesceEngine::recvWLRead(Addr addr) { assert(MSHR.size() <= numMSHREntries); - DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", - __func__, addr); - Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); assert(aligned_addr % peerMemoryAtomSize == 0); - // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].allocated); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); // Hit // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextRespondEvent for latency cycles in + // Can't just schedule the nextResponseEvent for latency cycles in // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); @@ -138,12 +142,12 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); - // TODO: Add a stat to count the number of WLItems that have been touched. + // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; - if (!nextRespondEvent.scheduled()) { - schedule(nextRespondEvent, nextCycle()); + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; return true; @@ -151,44 +155,50 @@ CoalesceEngine::recvWLRead(Addr addr) // miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu not " - "found in MSHRs.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" + " %lu not found in MSHRs.\n", __func__, block_index, addr); assert(MSHR.size() <= numMSHREntries); if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " - "Rejecting request.\n", __func__); + "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection stats.readRejections++; return false; } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); + DPRINTF(CoalesceEngine, "%s: MSHR " + "entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHR[block_index].size() <= numTgtsPerMSHR); DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " - "Rejecting request.\n", + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", __func__, block_index); stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; - if ((cacheBlocks[block_index].busyMask == 0) && (cacheBlocks[block_index].valid)) { - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " - "applyQueue.size = %u.\n", __func__, - block_index, applyQueue.size()); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " + "busy. It %s in the applyQueue.\n", + __func__, block_index, + applyQueue.find(block_index) ? "is" : "is not"); + if (!applyQueue.find(block_index)) { + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added %d to " + "applyQueue. 
applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); + } assert(!applyQueue.empty()); if ((!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); @@ -208,24 +218,18 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - - // fillQueue.push_back(block_index); - // assert(fillQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); - // FIXME: Fix this DPRINTF - // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " - // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", - // __func__, fillQueue.size()); - // if ((!nextMemoryReadEvent.pending()) && - // (!nextMemoryReadEvent.scheduled())) { - // schedule(nextMemoryReadEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " + "input %d to memoryFunctionQueue.\n", + __func__, block_index); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); @@ -236,21 +240,23 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu already " - "in MSHRs.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " + "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " - "Rejecting request.\n", - __func__, block_index); + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); stats.readRejections++; return false; } - if ((!cacheBlocks[block_index].hasConflict) && - (aligned_addr != cacheBlocks[block_index].addr)) { + if ((aligned_addr != cacheBlocks[block_index].addr)) { DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; + } else { + DPRINTF(CoalesceEngine, "%s: There is room for another target " + "for cacheBlocks[%d].\n", __func__, block_index); } if (aligned_addr != cacheBlocks[block_index].addr) { @@ -260,295 +266,88 @@ CoalesceEngine::recvWLRead(Addr addr) } MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " + "cacheBlocks[%d].\n", __func__, addr, block_index); stats.numVertexReads++; return true; } } } -void -CoalesceEngine::processNextMemoryReadEvent(int block_index) -{ - // assert(!nextMemoryReadEvent.pending()); - // if (memPort.blocked()) { - // // TODO: Implement interface where events of the CoalesceEngine are - // // pushed to a fifo to be scheduled later. - // nextMemoryReadEvent.sleep(); - // pendingEventQueue.push_back("nextMemoryReadEvent"); - // // Maximum three MemoryEvents. - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " - // "has been pushed to pendingEventQueue. " - // "pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - - // int block_index = fillQueue.front(); - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - - memPort.sendPacket(pkt); - - // fillQueue.pop_front(); - - // if (!fillQueue.empty()) { - // memoryFunctionQueue.push_back([this] { processNextMemoryReadEvent(); }); - // } -} - -// TODO: For loop to empty the entire responseQueue. -void -CoalesceEngine::processNextRespondEvent() -{ - Addr addr_response; - WorkListItem worklist_response; - - std::tie(addr_response, worklist_response) = responseQueue.front(); - peerWLEngine->handleIncomingWL(addr_response, worklist_response); - DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); - - if ((!nextRespondEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); - } -} - -void -CoalesceEngine::processNextMemoryEvent() -{ - if (memPort.blocked()) { - nextMemoryEvent.sleep(); - return; - } - - std::function next_memory_function; - int next_memory_function_input; - std::tie(next_memory_function, next_memory_function_input) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input); - memoryFunctionQueue.pop_front(); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); - - assert(!nextMemoryEvent.pending()); - assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { - schedule(nextMemoryEvent, nextCycle()); - } -} - -void -CoalesceEngine::recvMemRetry() -{ - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - // if (pendingEventQueue.empty()) { - // DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - // return; - // } - - // std::string front = pendingEventQueue.front(); - // DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); - - // if (front == "nextMemoryReadEvent") { - // assert(!nextMemoryReadEvent.scheduled()); - // assert(nextMemoryReadEvent.pending()); - // schedule(nextMemoryReadEvent, nextCycle()); - // nextMemoryReadEvent.wake(); - // } else if (front == "nextWriteBackEvent") { - // assert(!nextWriteBackEvent.scheduled()); - // assert(nextWriteBackEvent.pending()); - // schedule(nextWriteBackEvent, nextCycle()); - // nextWriteBackEvent.wake(); - // } else if (front == "nextRecvPushRetryEvent") { - // assert(!nextRecvPushRetryEvent.scheduled()); - // assert(nextRecvPushRetryEvent.pending()); - // schedule(nextRecvPushRetryEvent, nextCycle()); - // nextRecvPushRetryEvent.wake(); - // } else { - // panic("EVENT IS NOT RECOGNIZED.\n"); - // } - - // pendingEventQueue.pop_front(); - // return; - - if (!nextMemoryEvent.pending()) { - DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - return; - } - assert(!nextMemoryEvent.scheduled()); - nextMemoryEvent.wake(); - schedule(nextMemoryEvent, nextCycle()); -} - -// FIXME: Fix this function. 
bool CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); delete pkt; - DPRINTF(CoalesceEngine, "%s: Received a write response for Addr: %lu. Dropping " - "the packet.\n", __func__, pkt->getAddr()); return true; } + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + if (pkt->findNextSenderState()) { - Addr addr = pkt->getAddr(); + assert(!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid))); + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. int it = getBitIndexBase(addr); - int block_index = getBlockIndex(addr); - - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid)) { - // We read the address to send the wl but it is put in cache before - // the read response arrives. - if (cacheBlocks[block_index].busyMask == 0) { - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was found in the cache as idle.\n", - __func__, addr); - int push_needed = 0; - // It is not busy anymore, we have to send the wl from cache. 
- DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (cacheBlocks[block_index].items[i].degree == 0))); - // TODO: Make this more programmable - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; - } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - // Since we have just applied the line, we can take it out of - // the applyQueue if it's in there. No need to do the same - // thing for evictQueue. - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); - } - if (cacheBlocks[block_index].hasConflict) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // if ((!nextWriteBackEvent.pending()) && - // (!nextWriteBackEvent.scheduled())) { - // schedule(nextWriteBackEvent, nextCycle()); - // } - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - } - } - } else { - // The line is busy. 
Therefore, we have to disregard the data - // we received from the memory and also tell the push engine to - // deallocate the space it allocated for this retry. However, - // we still have to rememeber that these items need a retry. - // i.e. don't change needsPush, call recvWLItemRetry with - // do_push = false - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was found in the cache as busy.\n", - __func__, addr); - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - } - } else { - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was not found in the cache.\n", - __func__, addr); - WorkListItem* items = pkt->getPtr(); - int push_needed = 0; - // No applying of the line needed. - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (items[i].degree == 0))); - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry(items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was not found in the cache.\n", + __func__, addr); + WorkListItem* items = pkt->getPtr(); + int push_needed = 0; + // No applying of the line needed. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (items[i].degree == 0))); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry(items[i]); } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); + push_needed += needsPush[it + i]; + needsPush[it + i] = 0; } - + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + // } delete pkt; return true; } - Addr addr = pkt->getAddr(); - // int block_index = (addr / peerMemoryAtomSize) % numLines; - int block_index = getBlockIndex(addr); - - DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", - __func__, pkt->getAddr()); - // assert((cacheBlocks[block_index].allocated) && // allocated cache block - // (!cacheBlocks[block_index].valid) && // valid is false - // (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR - assert(cacheBlocks[block_index].allocated); - assert(!cacheBlocks[block_index].valid); - assert(MSHR.find(block_index) != MSHR.end()); - pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].allocated); + assert(!cacheBlocks[block_index].valid); + assert(MSHR.find(block_index) != MSHR.end()); + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, - block_index, i, 
cacheBlocks[block_index].items[i].to_string()); + for (int i = 0; i < numElementsPerLine; i++) { + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, + cacheBlocks[block_index].items[i].to_string()); + } + cacheBlocks[block_index].valid = true; + delete pkt; } - cacheBlocks[block_index].valid = true; - delete pkt; // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; for (int i = 0; i < MSHR[block_index].size(); i++) { Addr miss_addr = MSHR[block_index][i]; - Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cacheBlocks[%d] could " - "be serviced with the received packet.\n", - __func__, miss_addr, block_index); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); @@ -567,10 +366,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block - servicedIndices.push_back(i); - DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " - "removal.\n", __func__, i, block_index); + // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " + // "removal.\n", __func__, i, block_index); } } @@ -593,19 +391,46 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(cacheBlocks[block_index].hasConflict); } - if ((!nextRespondEvent.scheduled()) && + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); + schedule(nextResponseEvent, nextCycle()); } return true; } +// TODO: For loop to empty the entire responseQueue. +void +CoalesceEngine::processNextResponseEvent() +{ + Addr addr_response; + WorkListItem worklist_response; + + std::tie(addr_response, worklist_response) = responseQueue.front(); + peerWLEngine->handleIncomingWL(addr_response, worklist_response); + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
- Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); @@ -691,12 +516,11 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if ((cacheBlocks[block_index].hasConflict) && (cacheBlocks[block_index].busyMask == 0)) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " - // "writeBackQueue.size = %u.\n", __func__, - // block_index, writeBackQueue.size()); + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input %d " + "to memoryFunctionQueue.\n", __func__, block_index); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); @@ -711,23 +535,47 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextWriteBackEvent(int block_index) +CoalesceEngine::processNextMemoryEvent() { - // assert(!nextWriteBackEvent.pending()); - // if (memPort.blocked()) { - // nextWriteBackEvent.sleep(); - // pendingEventQueue.push_back("nextWriteBackEvent"); - // // Maximum three MemoryEvent. - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " - // "has been pushed to pendingEventQueue. 
" - // "pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - - // int block_index = writeBackQueue.front(); + if (memPort.blocked()) { + nextMemoryEvent.sleep(); + return; + } + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + std::tie( + next_memory_function, + next_memory_function_input) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input); + memoryFunctionQueue.pop_front(); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index) +{ + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + + memPort.sendPacket(pkt); +} + +void +CoalesceEngine::processNextWriteBack(int block_index) +{ // Why would we write it back if it does not have a conflict? assert(cacheBlocks[block_index].hasConflict); @@ -749,6 +597,10 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) "Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); memPort.sendPacket(write_pkt); + } else { + DPRINTF(CoalesceEngine, "%s: No change observed on " + "cacheBlocks[%d]. 
No write back needed.\n", + __func__, block_index); } assert(!MSHR[block_index].empty()); Addr miss_addr = MSHR[block_index].front(); @@ -756,7 +608,7 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) "cacheBlocks[%d] is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + roundDown(miss_addr, peerMemoryAtomSize); cacheBlocks[block_index].addr = aligned_miss_addr; cacheBlocks[block_index].busyMask = 0; @@ -766,53 +618,12 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) cacheBlocks[block_index].dirty = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - // fillQueue.push_back(block_index); - // assert(fillQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); - // if ((!nextMemoryReadEvent.pending()) && - // (!nextMemoryReadEvent.scheduled())){ - // schedule(nextMemoryReadEvent, nextCycle()); - // } - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - } - - // writeBackQueue.pop_front(); - // assert(writeBackQueue.size() <= numLines); - // DPRINTF(CoalesceEngine, "%s: Popped %d from writeBackQueue. 
" - // "writeBackQueue.size = %d, writeBackQueueSize = %d.\n", - // __func__, block_index, writeBackQueue.size(), numLines); - - // if (!writeBackQueue.empty()) { - // assert(!nextWriteBackEvent.pending()); - // assert(!nextWriteBackEvent.scheduled()); - // schedule(nextWriteBackEvent, nextCycle()); - // memoryFunctionQueue.push_back([this] { processNextWriteBackEvent(); }); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // } -} -void -CoalesceEngine::recvPushRetry() -{ - numRetriesReceived++; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - // For now since we do only one retry at a time, we should not receive - // a retry while this nextSendingRetryEvent is scheduled or is pending. - // assert(!nextRecvPushRetryEvent.pending()); - // assert(!nextRecvPushRetryEvent.scheduled()); - assert(numRetriesReceived == 1); - // schedule(nextRecvPushRetryEvent, nextCycle()); - // TODO: Pass slice_base to getOptimalBitVectorSlice - memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input %d to " + "memoryFunctionQueue.\n", __func__, block_index); } } @@ -866,7 +677,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) +CoalesceEngine::processNextPushRetry(int slice_base_2) { bool hit_in_cache; int slice_base; @@ -879,14 +690,6 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - // if nextRecvPushRetryEvent has been blocked by memory before - // if 
(nextRecvPushRetryEvent.getPrevState() == -1) { - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " - // "its MemRetry.\n", __func__); - // recvMemRetry(); - // nextRecvPushRetryEvent.setPrevState(0); - // } - int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); @@ -916,39 +719,15 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - // if ((!nextWriteBackEvent.pending()) && - // (!nextWriteBackEvent.scheduled())) { - // schedule(nextWriteBackEvent, nextCycle()); - // } - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed nextWriteBackEvent for" + " input %d to memoryFunctionQueue.\n", + __func__, block_index); } } } else { - // if (memPort.blocked()) { - // // assert(nextRecvPushRetryEvent.getPrevState() != -1); - // nextRecvPushRetryEvent.setPrevState(-1); - // nextRecvPushRetryEvent.sleep(); - // pendingEventQueue.push_back("nextRecvPushRetryEvent"); - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " - // "and has been pushed to pendingEventQueue." - // " pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - // if nextRecvPushRetryEvent has been blocked by memory before - // if (nextRecvPushRetryEvent.getPrevState() == -1) { - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " - // "unblocked by memPort. 
Setting prevState to 0.\n", __func__); - // nextRecvPushRetryEvent.setPrevState(0); - // } - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); @@ -961,18 +740,53 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) numRetriesReceived--; assert(numRetriesReceived == 0); } - // if (numRetriesReceived > 0) { - // schedule(nextRecvPushRetryEvent, nextCycle()); - // } + if (numRetriesReceived > 0) { - memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back([this] (int slice_base) { + processNextPushRetry(slice_base); + }, 0); + DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " + "0 to memoryFunctionQueue.\n", __func__); + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +void +CoalesceEngine::recvPushRetry() +{ + numRetriesReceived++; + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); + // For now since we do only one retry at a time, we should not receive + // a retry while this nextSendingRetryEvent is scheduled or is pending. 
+ assert(numRetriesReceived == 1); + + // TODO: Pass slice_base to getOptimalBitVectorSlice + memoryFunctionQueue.emplace_back([this] (int slice_base) { + processNextPushRetry(slice_base); + }, 0); + DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " + "memoryFunctionQueue.\n", __func__); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } } + + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4036dc49af..7db09cec11 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -77,58 +77,40 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; - // int nmpu; - // Addr memoryAddressOffset; - WLEngine* peerWLEngine; PushEngine* peerPushEngine; - Block* cacheBlocks; - int numLines; int numElementsPerLine; + Block* cacheBlocks; int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; - - // std::deque fillQueue; - std::deque> responseQueue; int numRetriesReceived; - InOutSet applyQueue; + UniqueFIFO applyQueue; std::bitset needsPush; - // InOutSet writeBackQueue; - - int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); - // std::deque pendingEventQueue; - - std::deque, int>> memoryFunctionQueue; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); + void processNextRead(int block_index); + void processNextWriteBack(int block_index); + void processNextPushRetry(int slice_base); + std::deque, int>> memoryFunctionQueue; - // MemoryEvent nextMemoryReadEvent; - void processNextMemoryReadEvent(int block_index); - - EventFunctionWrapper nextRespondEvent; - void processNextRespondEvent(); + EventFunctionWrapper 
nextResponseEvent; + void processNextResponseEvent(); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - // MemoryEvent nextWriteBackEvent; - void processNextWriteBackEvent(int block_index); - - // MemoryEvent nextRecvPushRetryEvent; - void processNextRecvPushRetryEvent(int slice_base); - struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); @@ -164,8 +146,6 @@ class CoalesceEngine : public BaseMemoryEngine void registerWLEngine(WLEngine* wl_engine); void recvPushRetry(); - - // virtual void startup() override; }; } From 08ca0a193d0d22ef85cf5a95691a0317ff14c276 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Jul 2022 16:59:30 -0700 Subject: [PATCH 137/247] Fixing cache block state machine. wip. --- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/coalesce_engine.cc | 385 ++++++++++++++++++++++--- src/accl/graph/sega/coalesce_engine.hh | 31 +- src/accl/graph/sega/state_machine.md | 1 + 4 files changed, 368 insertions(+), 50 deletions(-) create mode 100644 src/accl/graph/sega/state_machine.md diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 97a62d44a0..81a29df6af 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -43,5 +43,6 @@ DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') DebugFlag('CoalesceEngine') +DebugFlag('CacheBlockState') DebugFlag('PushEngine') DebugFlag('WLEngine') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6ed94fe938..a0c85de2f5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,6 +34,7 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" +#include "debug/CacheBlockState.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" @@ -104,11 +105,180 @@ CoalesceEngine::getBlockAddrFromBitIndex(int index) return 
peerMemoryRange.addIntlvBits(trimmed_addr); } +// TODO: Prev implementaton of recvWLRead. Remove +// bool +// CoalesceEngine::recvWLRead(Addr addr) +// { +// assert(MSHR.size() <= numMSHREntries); + +// Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); +// assert(aligned_addr % peerMemoryAtomSize == 0); +// int block_index = getBlockIndex(aligned_addr); +// assert(block_index < numLines); +// int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); +// assert(wl_offset < numElementsPerLine); +// DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " +// "This request maps to cacheBlocks[%d], aligned_addr: " +// "%lu, and wl_offset: %d.\n", __func__, addr, +// block_index, aligned_addr, wl_offset); + +// if ((cacheBlocks[block_index].addr == aligned_addr) && +// (cacheBlocks[block_index].valid)) { +// assert(cacheBlocks[block_index].allocated); +// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); +// // Hit +// // TODO: Add a hit latency as a param for this object. +// // Can't just schedule the nextResponseEvent for latency cycles in +// // the future. +// responseQueue.push_back(std::make_tuple(addr, +// cacheBlocks[block_index].items[wl_offset])); +// DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " +// "to responseQueue. responseQueue.size = %d, " +// "responseQueueSize = %d.\n", __func__, addr, +// cacheBlocks[block_index].items[wl_offset].to_string(), +// responseQueue.size(), +// peerWLEngine->getRegisterFileSize()); +// DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " +// "to responseQueue. responseQueue.size = %d, " +// "responseQueueSize = %d.\n", __func__, addr, +// cacheBlocks[block_index].items[wl_offset].to_string(), +// responseQueue.size(), +// peerWLEngine->getRegisterFileSize()); +// // TODO: Stat to count the number of WLItems that have been touched. 
+// cacheBlocks[block_index].busyMask |= (1 << wl_offset); +// stats.readHits++; + +// if (!nextResponseEvent.scheduled()) { +// schedule(nextResponseEvent, nextCycle()); +// } +// stats.numVertexReads++; +// return true; +// } else { +// // miss +// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); +// if (MSHR.find(block_index) == MSHR.end()) { +// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" +// " %lu not found in MSHRs.\n", __func__, block_index, addr); +// assert(MSHR.size() <= numMSHREntries); +// if (MSHR.size() == numMSHREntries) { +// // Out of MSHR entries +// DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " +// "Rejecting request.\n", __func__); +// // TODO: Break out read rejections into more than one stat +// // based on the cause of the rejection +// stats.readRejections++; +// return false; +// } else { +// DPRINTF(CoalesceEngine, "%s: MSHR " +// "entries available.\n", __func__); +// if (cacheBlocks[block_index].allocated) { +// assert(MSHR[block_index].size() <= numTgtsPerMSHR); +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " +// "with Addr: %lu.\n", __func__, addr, +// cacheBlocks[block_index].addr); +// if (MSHR[block_index].size() == numTgtsPerMSHR) { +// DPRINTF(CoalesceEngine, "%s: Out of targets for " +// "cacheBlocks[%d]. Rejecting request.\n", +// __func__, block_index); +// stats.readRejections++; +// return false; +// } +// cacheBlocks[block_index].hasConflict = true; +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " +// "for cacheBlocks[%d].\n", __func__, addr, block_index); +// stats.readMisses++; +// stats.numVertexReads++; +// if ((cacheBlocks[block_index].busyMask == 0) && +// (cacheBlocks[block_index].valid)) { +// DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " +// "busy. It %s in the applyQueue.\n", +// __func__, block_index, +// applyQueue.find(block_index) ? 
"is" : "is not"); +// if (!applyQueue.find(block_index)) { +// applyQueue.push_back(block_index); +// DPRINTF(CoalesceEngine, "%s: Added %d to " +// "applyQueue. applyQueue.size = %u.\n", +// __func__, block_index, applyQueue.size()); +// } +// assert(!applyQueue.empty()); +// if ((!nextApplyEvent.scheduled())) { +// schedule(nextApplyEvent, nextCycle()); +// } +// } +// return true; +// } else { +// assert(!cacheBlocks[block_index].valid); +// assert(MSHR[block_index].size() == 0); +// // MSHR available and no conflict +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " +// "Allocating a cache line for it.\n" +// , __func__, addr); + +// cacheBlocks[block_index].addr = aligned_addr; +// cacheBlocks[block_index].busyMask = 0; +// cacheBlocks[block_index].allocated = true; +// cacheBlocks[block_index].valid = false; +// cacheBlocks[block_index].hasConflict = false; +// DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" +// " Addr: %lu.\n", __func__, block_index, addr); +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " +// "for cacheBlocks[%d].\n", __func__, addr, block_index); +// memoryFunctionQueue.emplace_back( +// [this] (int block_index) { +// processNextRead(block_index); +// }, block_index); +// DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " +// "input %d to memoryFunctionQueue.\n", +// __func__, block_index); +// if ((!nextMemoryEvent.pending()) && +// (!nextMemoryEvent.scheduled())) { +// schedule(nextMemoryEvent, nextCycle()); +// } +// stats.readMisses++; +// stats.numVertexReads++; +// return true; +// } +// } +// } else { +// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " +// "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); +// if (MSHR[block_index].size() == numTgtsPerMSHR) { +// DPRINTF(CoalesceEngine, "%s: Out of targets for " +// "cacheBlocks[%d]. 
Rejecting request.\n", +// __func__, block_index); +// stats.readRejections++; +// return false; +// } +// if ((aligned_addr != cacheBlocks[block_index].addr)) { +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " +// "with Addr: %lu.\n", __func__, addr, +// cacheBlocks[block_index].addr); +// cacheBlocks[block_index].hasConflict = true; +// } else { +// DPRINTF(CoalesceEngine, "%s: There is room for another target " +// "for cacheBlocks[%d].\n", __func__, block_index); +// } + +// if (aligned_addr != cacheBlocks[block_index].addr) { +// stats.readMisses++; +// } else { +// stats.readHitUnderMisses++; +// } + +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " +// "cacheBlocks[%d].\n", __func__, addr, block_index); +// stats.numVertexReads++; +// return true; +// } +// } +// } + bool CoalesceEngine::recvWLRead(Addr addr) { - assert(MSHR.size() <= numMSHREntries); - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = getBlockIndex(aligned_addr); @@ -119,11 +289,18 @@ CoalesceEngine::recvWLRead(Addr addr) "This request maps to cacheBlocks[%d], aligned_addr: " "%lu, and wl_offset: %d.\n", __func__, addr, block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].allocated); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(!cacheBlocks[block_index].pendingData); + // No cache block could be in pendingApply and pendingWB at the + // same time. + assert(!(cacheBlocks[block_index].pendingApply && + cacheBlocks[block_index].pendingWB)); // Hit // TODO: Add a hit latency as a param for this object. 
// Can't just schedule the nextResponseEvent for latency cycles in @@ -144,20 +321,60 @@ CoalesceEngine::recvWLRead(Addr addr) peerWLEngine->getRegisterFileSize()); // TODO: Stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - stats.readHits++; + // If they are scheduled for apply and WB those schedules should be + // discarded. Since there is no easy way to take items out of the + // function queue. Those functions check for their respective bits + // and skip the process if the respective bit is set to false. + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if (!nextResponseEvent.scheduled()) { schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; return true; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].pendingData)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); + stats.readRejections++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " + "cacheBlocks[%d].\n", __func__, block_index); + } + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + return true; } else { // miss + assert(cacheBlocks[block_index].addr != aligned_addr); + assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" " %lu not found in MSHRs.\n", __func__, block_index, addr); - assert(MSHR.size() <= numMSHREntries); if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " @@ -169,11 +386,12 @@ CoalesceEngine::recvWLRead(Addr addr) } else { DPRINTF(CoalesceEngine, "%s: MSHR " "entries available.\n", __func__); - if (cacheBlocks[block_index].allocated) { - assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if ((cacheBlocks[block_index].valid) || + (cacheBlocks[block_index].pendingData)) { DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); if (MSHR[block_index].size() == numTgtsPerMSHR) { DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. 
Rejecting request.\n", @@ -181,43 +399,116 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readRejections++; return false; } - cacheBlocks[block_index].hasConflict = true; + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state.\n", __func__, block_index); + // We're in idle state + // Idle: valid && !pendingApply && !pendingWB; + // Note 0: needsApply has to be false. Because + // A cache line enters the idle state from two + // other states. First a busy state that does not + // need apply (needsApply is already false) or + // from pendingApplyState after being applied which + // clears the needsApply bit. needsApply is useful + // when a cache block has transitioned from + // pendingApply to busy without the apply happening. + // Note 1: pendingData does not have to be evaluated + // becuase pendingData is cleared when data + // arrives from the memory and valid does not + // denote cleanliness of the line. Rather it + // is used to differentiate between empty blocks + // and the blocks that have data from memory. + // pendingData denotes the transient state between + // getting a miss and getting the data for that miss. + // valid basically means that the data in the cache + // could be used to respond to read/write requests. + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + // There are no conflicts in idle state. 
+ assert(MSHR.find(block_index) == MSHR.end()); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" + "to be written back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextWriteBack for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does" + "not need to be written back.\n", + __func__, block_index); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextRead for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } + } + // cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; + // TODO: Add readConflicts here. 
stats.numVertexReads++; - if ((cacheBlocks[block_index].busyMask == 0) && - (cacheBlocks[block_index].valid)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " - "busy. It %s in the applyQueue.\n", - __func__, block_index, - applyQueue.find(block_index) ? "is" : "is not"); - if (!applyQueue.find(block_index)) { - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to " - "applyQueue. applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); - } - assert(!applyQueue.empty()); - if ((!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); - } - } return true; } else { - assert(!cacheBlocks[block_index].valid); - assert(MSHR[block_index].size() == 0); // MSHR available and no conflict DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " "Allocating a cache line for it.\n" , __func__, addr); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[blokc_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR[block_index].size() == 0); cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + // cacheBlocks[block_index].allocated = true; + // cacheBlocks[block_index].hasConflict = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); MSHR[block_index].push_back(addr); @@ -234,6 +525,9 @@ CoalesceEngine::recvWLRead(Addr addr) 
(!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); stats.readMisses++; stats.numVertexReads++; return true; @@ -241,7 +535,11 @@ CoalesceEngine::recvWLRead(Addr addr) } } else { DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); + "Addr: %lu already in MSHRs. It has a conflict " + "with addr: %lu.\n", __func__, block_index, addr, + cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() > 0); if (MSHR[block_index].size() == numTgtsPerMSHR) { DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. Rejecting request.\n", @@ -249,21 +547,12 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readRejections++; return false; } - if ((aligned_addr != cacheBlocks[block_index].addr)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - cacheBlocks[block_index].hasConflict = true; - } else { - DPRINTF(CoalesceEngine, "%s: There is room for another target " + DPRINTF(CoalesceEngine, "%s: There is room for another target " "for cacheBlocks[%d].\n", __func__, block_index); - } - if (aligned_addr != cacheBlocks[block_index].addr) { - stats.readMisses++; - } else { - stats.readHitUnderMisses++; - } + // cacheBlocks[block_index].hasConflict = true; + // TODO: Might want to differentiate between different misses. 
+ stats.readMisses++; MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " @@ -324,8 +613,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (cacheBlocks[block_index].addr == addr) { - assert(cacheBlocks[block_index].allocated); + DPRINTF(CoalesceEngine, "%s: Received read response to " + "fill cacheBlocks[%d].\n", __func__, block_index); assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); assert(MSHR.find(block_index) != MSHR.end()); pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -335,6 +631,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].pendingData = false; delete pkt; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 7db09cec11..e7655a069e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" +#include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -51,24 +52,42 @@ class CoalesceEngine : public BaseMemoryEngine { WorkListItem* items; Addr addr; - uint8_t busyMask; - bool allocated; + uint64_t busyMask; bool valid; + bool needsApply; + bool needsWB; + bool pendingData; + bool pendingApply; + bool pendingWB; + + bool allocated; bool hasConflict; - bool dirty; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} Block(int num_elements): addr(0), busyMask(0), - allocated(false), 
valid(false), - hasConflict(false), - dirty(false) + needsApply(false), + needsWB(false), + pendingData(false), + pendingApply(false), + pendingWB(false), + allocated(false), + hasConflict(false) { items = new WorkListItem [num_elements]; } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "needsApply: %s, needsWB: %s, pendingData: %s, " + "pendingApply: %s, pendingWB: %s}", addr, busyMask, + valid ? "true" : "false", needsApply ? "true" : "false", + needsWB ? "true" : "false", pendingData ? "true" : "false", + pendingApply ? "true" : "false", pendingWB ? "true" : "false"); + } }; struct SenderState : public Packet::SenderState diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md new file mode 100644 index 0000000000..203c47cf02 --- /dev/null +++ b/src/accl/graph/sega/state_machine.md @@ -0,0 +1 @@ +# CoalesceEngine Block state machine \ No newline at end of file From 2b2b27ce86cd7c6d692af11e3f3f42b712c4d31b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 30 Jul 2022 23:14:08 -0700 Subject: [PATCH 138/247] Fixing cache block state machine. cont. wip --- src/accl/graph/sega/coalesce_engine.cc | 288 +++++++++---------------- 1 file changed, 98 insertions(+), 190 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a0c85de2f5..8f33a2d893 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -105,177 +105,6 @@ CoalesceEngine::getBlockAddrFromBitIndex(int index) return peerMemoryRange.addIntlvBits(trimmed_addr); } -// TODO: Prev implementaton of recvWLRead. 
Remove -// bool -// CoalesceEngine::recvWLRead(Addr addr) -// { -// assert(MSHR.size() <= numMSHREntries); - -// Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); -// assert(aligned_addr % peerMemoryAtomSize == 0); -// int block_index = getBlockIndex(aligned_addr); -// assert(block_index < numLines); -// int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); -// assert(wl_offset < numElementsPerLine); -// DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " -// "This request maps to cacheBlocks[%d], aligned_addr: " -// "%lu, and wl_offset: %d.\n", __func__, addr, -// block_index, aligned_addr, wl_offset); - -// if ((cacheBlocks[block_index].addr == aligned_addr) && -// (cacheBlocks[block_index].valid)) { -// assert(cacheBlocks[block_index].allocated); -// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); -// // Hit -// // TODO: Add a hit latency as a param for this object. -// // Can't just schedule the nextResponseEvent for latency cycles in -// // the future. -// responseQueue.push_back(std::make_tuple(addr, -// cacheBlocks[block_index].items[wl_offset])); -// DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " -// "to responseQueue. responseQueue.size = %d, " -// "responseQueueSize = %d.\n", __func__, addr, -// cacheBlocks[block_index].items[wl_offset].to_string(), -// responseQueue.size(), -// peerWLEngine->getRegisterFileSize()); -// DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " -// "to responseQueue. responseQueue.size = %d, " -// "responseQueueSize = %d.\n", __func__, addr, -// cacheBlocks[block_index].items[wl_offset].to_string(), -// responseQueue.size(), -// peerWLEngine->getRegisterFileSize()); -// // TODO: Stat to count the number of WLItems that have been touched. 
-// cacheBlocks[block_index].busyMask |= (1 << wl_offset); -// stats.readHits++; - -// if (!nextResponseEvent.scheduled()) { -// schedule(nextResponseEvent, nextCycle()); -// } -// stats.numVertexReads++; -// return true; -// } else { -// // miss -// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); -// if (MSHR.find(block_index) == MSHR.end()) { -// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" -// " %lu not found in MSHRs.\n", __func__, block_index, addr); -// assert(MSHR.size() <= numMSHREntries); -// if (MSHR.size() == numMSHREntries) { -// // Out of MSHR entries -// DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " -// "Rejecting request.\n", __func__); -// // TODO: Break out read rejections into more than one stat -// // based on the cause of the rejection -// stats.readRejections++; -// return false; -// } else { -// DPRINTF(CoalesceEngine, "%s: MSHR " -// "entries available.\n", __func__); -// if (cacheBlocks[block_index].allocated) { -// assert(MSHR[block_index].size() <= numTgtsPerMSHR); -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " -// "with Addr: %lu.\n", __func__, addr, -// cacheBlocks[block_index].addr); -// if (MSHR[block_index].size() == numTgtsPerMSHR) { -// DPRINTF(CoalesceEngine, "%s: Out of targets for " -// "cacheBlocks[%d]. Rejecting request.\n", -// __func__, block_index); -// stats.readRejections++; -// return false; -// } -// cacheBlocks[block_index].hasConflict = true; -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " -// "for cacheBlocks[%d].\n", __func__, addr, block_index); -// stats.readMisses++; -// stats.numVertexReads++; -// if ((cacheBlocks[block_index].busyMask == 0) && -// (cacheBlocks[block_index].valid)) { -// DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " -// "busy. It %s in the applyQueue.\n", -// __func__, block_index, -// applyQueue.find(block_index) ? 
"is" : "is not"); -// if (!applyQueue.find(block_index)) { -// applyQueue.push_back(block_index); -// DPRINTF(CoalesceEngine, "%s: Added %d to " -// "applyQueue. applyQueue.size = %u.\n", -// __func__, block_index, applyQueue.size()); -// } -// assert(!applyQueue.empty()); -// if ((!nextApplyEvent.scheduled())) { -// schedule(nextApplyEvent, nextCycle()); -// } -// } -// return true; -// } else { -// assert(!cacheBlocks[block_index].valid); -// assert(MSHR[block_index].size() == 0); -// // MSHR available and no conflict -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " -// "Allocating a cache line for it.\n" -// , __func__, addr); - -// cacheBlocks[block_index].addr = aligned_addr; -// cacheBlocks[block_index].busyMask = 0; -// cacheBlocks[block_index].allocated = true; -// cacheBlocks[block_index].valid = false; -// cacheBlocks[block_index].hasConflict = false; -// DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" -// " Addr: %lu.\n", __func__, block_index, addr); -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " -// "for cacheBlocks[%d].\n", __func__, addr, block_index); -// memoryFunctionQueue.emplace_back( -// [this] (int block_index) { -// processNextRead(block_index); -// }, block_index); -// DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " -// "input %d to memoryFunctionQueue.\n", -// __func__, block_index); -// if ((!nextMemoryEvent.pending()) && -// (!nextMemoryEvent.scheduled())) { -// schedule(nextMemoryEvent, nextCycle()); -// } -// stats.readMisses++; -// stats.numVertexReads++; -// return true; -// } -// } -// } else { -// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " -// "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); -// if (MSHR[block_index].size() == numTgtsPerMSHR) { -// DPRINTF(CoalesceEngine, "%s: Out of targets for " -// "cacheBlocks[%d]. 
Rejecting request.\n", -// __func__, block_index); -// stats.readRejections++; -// return false; -// } -// if ((aligned_addr != cacheBlocks[block_index].addr)) { -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " -// "with Addr: %lu.\n", __func__, addr, -// cacheBlocks[block_index].addr); -// cacheBlocks[block_index].hasConflict = true; -// } else { -// DPRINTF(CoalesceEngine, "%s: There is room for another target " -// "for cacheBlocks[%d].\n", __func__, block_index); -// } - -// if (aligned_addr != cacheBlocks[block_index].addr) { -// stats.readMisses++; -// } else { -// stats.readHitUnderMisses++; -// } - -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " -// "cacheBlocks[%d].\n", __func__, addr, block_index); -// stats.numVertexReads++; -// return true; -// } -// } -// } - bool CoalesceEngine::recvWLRead(Addr addr) { @@ -615,6 +444,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " "fill cacheBlocks[%d].\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); @@ -632,6 +463,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].pendingData = false; + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); delete pkt; } @@ -639,7 +472,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHR[block_index].size(); i++) { Addr miss_addr = MSHR[block_index][i]; - Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); if 
(aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " @@ -662,6 +496,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); // End of the said block servicedIndices.push_back(i); // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " @@ -677,15 +513,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHR[block_index].erase(MSHR[block_index].begin() + servicedIndices[i] - bias); bias++; - DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", - __func__, print_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced " + "and is removed.\n", __func__, print_addr); } if (MSHR[block_index].empty()) { MSHR.erase(block_index); - cacheBlocks[block_index].hasConflict = false; - } else { - assert(cacheBlocks[block_index].hasConflict); + // cacheBlocks[block_index].hasConflict = false; } if ((!nextResponseEvent.scheduled()) && @@ -726,37 +560,111 @@ CoalesceEngine::processNextResponseEvent() void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - // TODO: Parameterize all the numbers here. Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - - DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", - __func__, wl.to_string(), addr); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. 
This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, wl.to_string(), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, wl.to_string(), addr); + // Desing does not allow for write misses for now. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask != 0); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + // respective bit in busyMask for wl is set. assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { - cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].items[wl_offset] = wl; + cacheBlocks[block_index].needsApply |= true; + // NOTE: We don't set needsWB and rely on processNextApplyEvent to + // set that bit. stats.numVertexWrites++; } - cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cacheBlocks[%d]." - " It does not have any taken items anymore.\n", - __func__, block_index); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. 
applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); + if (cacheBlocks[block_index].needsApply) { + cacheBlocks[block_index].pendingApply = true; + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " + "applyQueue.\n", __func__, block_index); + } else { + assert(MSHR.size() <= numMSHREntries); + // cache line has conflict. + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflict.\n", __func__, block_index); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" + " back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" + " a write back.\n", __func__, block_index); + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + 
DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + } } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if ((!applyQueue.empty()) && (!nextApplyEvent.scheduled())) { From f138726a23ee6395f6c8f55a278677690cb57c83 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 31 Jul 2022 14:32:04 -0700 Subject: [PATCH 139/247] Completed cache block state machine. Needs rework of push interface. --- src/accl/graph/sega/coalesce_engine.cc | 205 +++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 7 +- 2 files changed, 109 insertions(+), 103 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8f33a2d893..904889f12b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -198,7 +198,11 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - assert(cacheBlocks[block_index].addr != aligned_addr); + // FIXME: Kake this assert work. It will break if the cache block + // is cold and addr or aligned_addr is 0. It fails because cache block + // addr field is initialized to 0. Unfortunately Addr type is unsigned. + // So you can not initialized addr to -1. 
+ // assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { @@ -220,14 +224,6 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. Rejecting request.\n", - __func__, block_index); - stats.readRejections++; - return false; - } if ((cacheBlocks[block_index].valid) && (cacheBlocks[block_index].busyMask == 0) && (!cacheBlocks[block_index].pendingApply) && @@ -288,7 +284,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].pendingWB = false; memoryFunctionQueue.emplace_back( [this] (int block_index) { processNextRead(block_index); @@ -323,7 +319,7 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[blokc_index].pendingData); + assert(!cacheBlocks[block_index].pendingData); assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); assert(MSHR[block_index].size() == 0); @@ -607,6 +603,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } } else { assert(MSHR.size() <= numMSHREntries); // cache line has conflict. 
@@ -666,70 +666,71 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); - } - } void CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + "cacheBlock[%d] to be applied.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingWB); - if (cacheBlocks[block_index].busyMask != 0) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " - "apply process. Therefore, ignoring the apply schedule.\n", - __func__, block_index); - stats.falseApplySchedules++; - } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. 
" - "Therefore, no apply needed.\n", __func__, block_index); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] could be applied.\n", - __func__, block_index); - for (int i = 0; i < numElementsPerLine; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - - if (new_prop != old_prop) { - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - int bit_index = - getBitIndexBase(cacheBlocks[block_index].addr) + i; - if ((cacheBlocks[block_index].items[i].degree != 0) && - (needsPush[bit_index] == 0)) { - // If the respective bit in the bit vector is set - // there is no need to try and resend it. + if (cacheBlocks[block_index].pendingApply) { + assert(cacheBlocks[block_index].busyMask == 0); + for (int index = 0; index < numElementsPerLine; index++) { + uint32_t current_prop = cacheBlocks[block_index].items[index].prop; + uint32_t new_prop = std::min(current_prop, + cacheBlocks[block_index].items[index].tempProp); + if (new_prop != current_prop) { + cacheBlocks[block_index].items[index].tempProp = new_prop; + cacheBlocks[block_index].items[index].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu][%d]: %s.\n", + __func__, cacheBlocks[block_index].addr, index, + cacheBlocks[block_index].items[index].to_string()); + + int bit_index_base = + getBitIndexBase(cacheBlocks[block_index].addr); + if ((needsPush[bit_index_base + index] == 0) && + (cacheBlocks[block_index].items[index].degree != 0)) { if (peerPushEngine->allocatePushSpace()) { peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); + cacheBlocks[block_index].items[index]); } else { - needsPush[bit_index] = 1; + 
needsPush[bit_index_base + index] = 1; } } } } - } + cacheBlocks[block_index].needsWB = true; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingApply = false; - // TODO: This is where eviction policy goes - if ((cacheBlocks[block_index].hasConflict) && - (cacheBlocks[block_index].busyMask == 0)) { - memoryFunctionQueue.emplace_back([this] (int block_index) { + assert(MSHR.size() < numMSHREntries); + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflicts.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBack(block_index); }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input %d " - "to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); } + DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); } applyQueue.pop_front(); @@ -770,6 +771,17 @@ CoalesceEngine::processNextMemoryEvent() void CoalesceEngine::processNextRead(int block_index) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + 
assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " @@ -781,54 +793,53 @@ CoalesceEngine::processNextRead(int block_index) void CoalesceEngine::processNextWriteBack(int block_index) { - // Why would we write it back if it does not have a conflict? - assert(cacheBlocks[block_index].hasConflict); - - if ((cacheBlocks[block_index].busyMask != 0) || - (applyQueue.find(block_index))) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " - "writeback process. Therefore, ignoring the apply schedule.\n", - __func__, block_index); - // FIXME: Fix the name of this stat. - stats.falseEvictSchedules++; - } else { - if (cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: Change observed on " - "cacheBlocks[%d].\n", __func__, block_index); - PacketPtr write_pkt = createWritePacket( + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + if (cacheBlocks[block_index].pendingWB) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to " + DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, - write_pkt->getAddr(), write_pkt->getSize()); - memPort.sendPacket(write_pkt); - } else { - DPRINTF(CoalesceEngine, "%s: No change observed on " - "cacheBlocks[%d]. No write back needed.\n", - __func__, block_index); - } - assert(!MSHR[block_index].empty()); + pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].pendingWB = false; + Addr miss_addr = MSHR[block_index].front(); - DPRINTF(CoalesceEngine, "%s: First conflicting address for " - "cacheBlocks[%d] is Addr: %lu.\n", - __func__, block_index, miss_addr); Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " - "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + 
cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextRead(block_index); - }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input %d to " - "memoryFunctionQueue.\n", __func__, block_index); + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); } } @@ -866,7 +877,7 @@ CoalesceEngine::getOptimalBitVectorSlice() // } return std::make_tuple(true, it); } else if (!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].allocated))) { + (cacheBlocks[block_index].pendingData))) { // score += numElementsPerLine; // if (current_score > score) { // score = current_score; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e7655a069e..2ba0b62aaf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -59,9 +59,6 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingData; bool pendingApply; bool pendingWB; - - bool allocated; - bool hasConflict; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -73,9 +70,7 @@ class CoalesceEngine : public BaseMemoryEngine needsWB(false), pendingData(false), pendingApply(false), - pendingWB(false), - allocated(false), - hasConflict(false) + pendingWB(false) { items = new WorkListItem [num_elements]; } From 4138a240b59a7d1da2370ff87d2848787a85ec09 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 2 Aug 2022 22:33:54 -0700 Subject: [PATCH 140/247] Fixing scheduling error of memory functions. 
--- src/accl/graph/SConscript | 32 ----- src/accl/graph/base/data_structs.hh | 2 +- src/accl/graph/sega/SConscript | 9 +- src/accl/graph/sega/coalesce_engine.cc | 176 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 24 ++-- 5 files changed, 120 insertions(+), 123 deletions(-) delete mode 100644 src/accl/graph/SConscript diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript deleted file mode 100644 index 5dffd1a396..0000000000 --- a/src/accl/graph/SConscript +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2016 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Import('*') - -DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', - 'WLEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 707b57c56f..830f1ecc16 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -47,7 +47,7 @@ struct __attribute__ ((packed)) WorkListItem std::string to_string() { return csprintf( - "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", tempProp, prop, degree, edgeIndex); } diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 81a29df6af..4c398b5ccd 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -39,10 +39,15 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') +DebugFlag('BaseMemoryEngine') +DebugFlag('BitVector') DebugFlag('CenteralController') -DebugFlag('CoalesceEngine') DebugFlag('CacheBlockState') +DebugFlag('CoalesceEngine') DebugFlag('PushEngine') +DebugFlag('SEGAStructureSize') DebugFlag('WLEngine') + +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', + 'WLEngine', 'BaseMemoryEngine']) \ No newline at end of file diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 904889f12b..da2bc54c19 
100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -33,8 +33,9 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/CoalesceEngine.hh" +#include "debug/BitVector.hh" #include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" @@ -76,6 +77,13 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) peerWLEngine = wl_engine; } +DrainState +CoalesceEngine::drain() +{ + DPRINTF(CoalesceEngine, "%s: drain called.\n"); + return DrainState::Drained; +} + // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) @@ -156,6 +164,7 @@ CoalesceEngine::recvWLRead(Addr addr) // and skip the process if the respective bit is set to false. cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -198,7 +207,7 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - // FIXME: Kake this assert work. It will break if the cache block + // FIXME: Make this assert work. It will break if the cache block // is cold and addr or aligned_addr is 0. It fails because cache block // addr field is initialized to 0. Unfortunately Addr type is unsigned. // So you can not initialized addr to -1. 
@@ -258,10 +267,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" "to be written back.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed " "processNextWriteBack for input " "%d to memoryFunctionQueue.\n", @@ -274,7 +284,7 @@ CoalesceEngine::recvWLRead(Addr addr) "%s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does" + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " "not need to be written back.\n", __func__, block_index); cacheBlocks[block_index].addr = aligned_addr; @@ -285,10 +295,11 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed " "processNextRead for input " "%d to memoryFunctionQueue.\n", @@ -332,17 +343,16 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - // cacheBlocks[block_index].allocated = true; - // cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, 
block_index, addr); MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " "input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -415,7 +425,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem* items = pkt->getPtr(); int push_needed = 0; // No applying of the line needed. - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { @@ -427,7 +437,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); @@ -459,6 +469,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].pendingData = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); delete pkt; @@ -492,6 +503,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); // End of the said block @@ -590,6 +602,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -600,6 +613,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if ((cacheBlocks[block_index].busyMask == 0)) { if (cacheBlocks[block_index].needsApply) { cacheBlocks[block_index].pendingApply = true; + cacheBlocks[block_index].lastChangedTick = curTick(); applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); @@ -617,10 +631,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" " back.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " "for input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -645,10 +660,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - 
processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " "for input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -710,15 +726,18 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].needsWB = true; cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); assert(MSHR.size() < numMSHREntries); if (MSHR.find(block_index) != MSHR.end()) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " "conflicts.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" " %d to memoryFunctionQueue.\n", __func__, block_index); if ((!nextMemoryEvent.pending()) && @@ -750,12 +769,14 @@ CoalesceEngine::processNextMemoryEvent() DPRINTF(CoalesceEngine, "%s: Processing another " "memory function.\n", __func__); - std::function next_memory_function; + std::function next_memory_function; int next_memory_function_input; + Tick next_memory_function_tick; std::tie( next_memory_function, - next_memory_function_input) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input); + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); memoryFunctionQueue.pop_front(); DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" "memoryFunctionQueue.size = %d.\n", __func__, @@ -769,12 +790,16 @@ CoalesceEngine::processNextMemoryEvent() } void -CoalesceEngine::processNextRead(int block_index) +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // + assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); @@ -791,23 +816,25 @@ CoalesceEngine::processNextRead(int block_index) } void -CoalesceEngine::processNextWriteBack(int block_index) +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - - // Why would we write it back if it does not have a conflict. - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - if (cacheBlocks[block_index].pendingWB) { + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].needsWB); assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(cacheBlocks[block_index].pendingWB); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -833,13 +860,21 @@ CoalesceEngine::processNextWriteBack(int block_index) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextRead(block_index); - }, block_index); + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); } } @@ -863,9 +898,14 @@ CoalesceEngine::getOptimalBitVectorSlice() // current_score += current_popcount; Addr addr = getBlockAddrFromBitIndex(it); int block_index = getBlockIndex(addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].busyMask == 0)) { + // Idle state: valid && !pendingApply && !pendingWB + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); // current_score += numElementsPerLine * 2; // if (current_score > score) { // score = current_score; @@ -876,8 +916,7 @@ CoalesceEngine::getOptimalBitVectorSlice() // } // } return std::make_tuple(true, it); - } else if (!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].pendingData))) { + } else if (cacheBlocks[block_index].addr != addr) { // score += numElementsPerLine; // if (current_score > score) { // score = current_score; @@ -893,7 +932,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextPushRetry(int slice_base_2) +CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; @@ -907,17 +946,11 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) assert(cacheBlocks[block_index].busyMask == 0); int push_needed = 0; - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { - // TODO: Make this more programmable - uint32_t new_prop = 
std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; if (needsPush[slice_base + i] == 1) { peerPushEngine->recvWLItemRetry( cacheBlocks[block_index].items[i]); @@ -925,24 +958,11 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) push_needed += needsPush[slice_base + i]; needsPush[slice_base + i] = 0; } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); assert(peerPushEngine->getNumRetries() == needsPush.count()); - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); - } - if (cacheBlocks[block_index].hasConflict) { - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed nextWriteBackEvent for" - " input %d to memoryFunctionQueue.\n", - __func__, block_index); - } - } } else { PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -958,9 +978,10 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) } if (numRetriesReceived > 0) { - memoryFunctionQueue.emplace_back([this] (int slice_base) { - processNextPushRetry(slice_base); - }, 0); + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextPushRetry(slice_base, schedule_tick); + }, 0, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " "0 to memoryFunctionQueue.\n", __func__); } @@ -990,9 +1011,10 @@ CoalesceEngine::recvPushRetry() assert(numRetriesReceived == 1); // TODO: Pass slice_base to 
getOptimalBitVectorSlice - memoryFunctionQueue.emplace_back([this] (int slice_base) { - processNextPushRetry(slice_base); - }, 0); + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextPushRetry(slice_base, schedule_tick); + }, 0, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " "memoryFunctionQueue.\n", __func__); if ((!nextMemoryEvent.pending()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2ba0b62aaf..ce6e0daca6 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -59,6 +59,7 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingData; bool pendingApply; bool pendingWB; + Tick lastChangedTick; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -70,7 +71,8 @@ class CoalesceEngine : public BaseMemoryEngine needsWB(false), pendingData(false), pendingApply(false), - pendingWB(false) + pendingWB(false), + lastChangedTick(0) { items = new WorkListItem [num_elements]; } @@ -78,10 +80,11 @@ class CoalesceEngine : public BaseMemoryEngine std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s}", addr, busyMask, - valid ? "true" : "false", needsApply ? "true" : "false", - needsWB ? "true" : "false", pendingData ? "true" : "false", - pendingApply ? "true" : "false", pendingWB ? "true" : "false"); + "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + needsApply ? "true" : "false", needsWB ? "true" : "false", + pendingData ? "true" : "false", pendingApply ? "true" : "false", + pendingWB ? 
"true" : "false", lastChangedTick); } }; @@ -114,10 +117,10 @@ class CoalesceEngine : public BaseMemoryEngine MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); - void processNextRead(int block_index); - void processNextWriteBack(int block_index); - void processNextPushRetry(int slice_base); - std::deque, int>> memoryFunctionQueue; + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextPushRetry(int slice_base, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); @@ -151,12 +154,11 @@ class CoalesceEngine : public BaseMemoryEngine public: PARAMS(CoalesceEngine); - CoalesceEngine(const Params ¶ms); + virtual DrainState drain() override; bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - void registerWLEngine(WLEngine* wl_engine); void recvPushRetry(); From 1194dc3ec83a9b78acfa4487cbd2552eed74c317 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 3 Aug 2022 12:41:28 -0700 Subject: [PATCH 141/247] Fixing incorrect assert. 
--- src/accl/graph/sega/coalesce_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index da2bc54c19..21dd746aad 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -728,7 +728,7 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].lastChangedTick = curTick(); - assert(MSHR.size() < numMSHREntries); + assert(MSHR.size() <= numMSHREntries); if (MSHR.find(block_index) != MSHR.end()) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " "conflicts.\n", __func__, block_index); From c1d92aed296ca6827fb75047216c32efbe477b98 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 5 Aug 2022 13:37:54 -0700 Subject: [PATCH 142/247] Updating memory address mapping and interface for push coalesce. --- configs/accl/sega.py | 30 ++++++++++------- src/accl/graph/base/base_reduce_engine.cc | 2 +- src/accl/graph/base/base_reduce_engine.hh | 3 +- src/accl/graph/base/data_structs.hh | 19 +++++++++++ src/accl/graph/sega/PushEngine.py | 3 +- src/accl/graph/sega/push_engine.cc | 40 ++++++++++++++++------- src/accl/graph/sega/push_engine.hh | 35 +++++++++++++++----- 7 files changed, 96 insertions(+), 36 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7577331f2b..26488ef69d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -8,20 +8,23 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=2, + self.push_engine = PushEngine(base_edge_addr=0, + push_req_queue_size=32, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + resp_queue_size=64) + # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, + # push_req_queue_size=32, + # attached_memory_atom_size=64, + # 
resp_queue_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="32B", - num_mshr_entry=1, - num_tgts_per_mshr=1) + cache_size="8MiB", + num_mshr_entry=32, + num_tgts_per_mshr=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=64, + register_file_size=32) def getRespPort(self): return self.wl_engine.resp_port @@ -74,10 +77,15 @@ def __init__(self, latency="30ns") ) edge_mem_ctrl.append( - SimpleMemory(range=self._edge_ranges[i], + # SimpleMemory(range=self._edge_ranges[i], + # bandwidth="4.8GB/s", + # latency="30ns", + # image_file=f"{graph_path}/edgelist_{i}") + SimpleMemory(range=AddrRange(self._edge_chunk_size), bandwidth="4.8GB/s", latency="30ns", - image_file=f"{graph_path}/edgelist_{i}") + image_file=f"{graph_path}/edgelist_{i}", + in_addr_map=False) ) self.vertex_mem_ctrl = vertex_mem_ctrl self.edge_mem_ctrl = edge_mem_ctrl diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index 38a8662ed0..ade95800d2 100644 --- a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -31,7 +31,7 @@ namespace gem5 { -BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): +BaseReduceEngine::BaseReduceEngine(const Params ¶ms): ClockedObject(params), system(params.system), _requestorId(system->getRequestorId(this)) diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index c8c9784ed1..268bb60b76 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -47,8 +47,7 @@ class BaseReduceEngine : public ClockedObject public: PARAMS(BaseReduceEngine); - - BaseReduceEngine(const BaseReduceEngineParams ¶ms); + BaseReduceEngine(const Params ¶ms); ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } diff --git 
a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 830f1ecc16..6f775d8a38 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -78,15 +78,34 @@ struct __attribute__ ((packed)) Edge return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); } + Edge(): weight(0), neighbor(0) {} + Edge(uint16_t weight, uint64_t neighbor): weight(weight), neighbor(neighbor) {} + }; static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); +struct CompleteEdge { + uint64_t src; + uint64_t dst; + uint32_t weight; + + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight): + src(src), dst(dst), weight(weight) + {} + + std::string to_string() + { + return csprintf("CompleteEdge{src: %lu, dst:%lu, weight: %u}", + src, dst, weight); + } +}; + template class UniqueFIFO { diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 447731219e..a45f5d6ead 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -35,8 +35,7 @@ class PushEngine(BaseMemoryEngine): cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr("The base address for the " - "attached edge memory") + push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d87462d7dd..d071e8fd37 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -38,7 +38,6 @@ namespace gem5 PushEngine::PushEngine(const Params ¶ms): BaseMemoryEngine(params), reqPort(name() + ".req_port", this), - baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), numTotalRetries(0), numPendingRetries(0), onTheFlyMemReqs(0), @@ -140,12 +139,12 @@ PushEngine::recvWLItem(WorkListItem wl) "checking if there is 
enough push space. Use allocatePushSpace.\n"); DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + peerMemoryAtomSize, value, 0); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -162,12 +161,12 @@ PushEngine::recvWLItemRetry(WorkListItem wl) DPRINTF(PushEngine, "%s: Received %s with retry.\n", __func__, wl.to_string()); - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + peerMemoryAtomSize, value, 0); assert(pushReqQueue.size() <= pushReqQueueSize); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -191,22 +190,24 @@ PushEngine::processNextMemoryReadEvent() Addr aligned_addr, offset; int num_edges; - PushPacketInfoGen &curr_info = pushReqQueue.front(); + EdgeReadInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); DPRINTF(PushEngine, "%s: Current packet information generated by " - "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "EdgeReadInfoGen. 
aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; reqValueMap[pkt->req] = curr_info.value(); + PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; + reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); onTheFlyMemReqs++; if (curr_info.done()) { - DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); + DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); pushReqQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", @@ -228,9 +229,6 @@ PushEngine::processNextMemoryReadEvent() } } - // if ((!nextMemoryReadEvent.scheduled()) && (!pushReqQueue.empty())) { - // schedule(nextMemoryReadEvent, nextCycle()); - // } if (!pushReqQueue.empty()) { assert(!nextMemoryReadEvent.pending()); assert(!nextMemoryReadEvent.scheduled()); @@ -265,6 +263,20 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs--; assert(memRespQueue.size() <= memRespQueueSize); + uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + PushInfo push_info = reqInfoMap[pkt->req]; + pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); + + std::vector edges; + for (int i = 0; i < push_info.numElements; i++) { + Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); + Addr edge_dst = edge->neighbor; + uint32_t edge_weight = edge->weight; + edges.emplace_back(push_info.src, edge_dst, edge_weight); + } + edgeQueue.push_back(edges); + delete pkt_data; + if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); } @@ -288,6 +300,12 @@ PushEngine::processNextPushEvent() Edge* curr_edge = (Edge*) (data + offset); + std::vector& current_edges = edgeQueue.front(); + while(!current_edges.empty()) { + CompleteEdge curr_edge = current_edges.back(); + 
DPRINTF(PushEngine, "%s: %s.\n", __func__, curr_edge.to_string()); + current_edges.pop_back(); + } // TODO: Implement propagate function here uint32_t update_value = value + 1; PacketPtr update = createUpdatePacket( diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9b182e2251..7fb6c42579 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -42,19 +42,21 @@ class CoalesceEngine; class PushEngine : public BaseMemoryEngine { private: - class PushPacketInfoGen { + class EdgeReadInfoGen { private: Addr _start; Addr _end; size_t _step; size_t _atom; + uint32_t _value; + Addr _src; public: - PushPacketInfoGen(Addr start, Addr end, size_t step, - size_t atom, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _value(value) + EdgeReadInfoGen(Addr start, Addr end, size_t step, + size_t atom, uint32_t value, Addr src): + _start(start), _end(end), _step(step), + _atom(atom), _value(value), _src(src) {} std::tuple nextReadPacketInfo() @@ -74,8 +76,17 @@ class PushEngine : public BaseMemoryEngine return std::make_tuple(aligned_addr, offset, num_items); } - uint32_t value() { return _value; } bool done() { return (_start >= _end); } + + Addr src() { return _src; } + uint32_t value() { return _value; } + }; + + struct PushInfo { + Addr src; + uint32_t value; + Addr offset; + int numElements; }; class ReqPort : public RequestPort @@ -98,26 +109,27 @@ class PushEngine : public BaseMemoryEngine virtual void recvReqRetry(); }; + bool _running; int numElementsPerLine; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; - Addr baseEdgeAddr; - int pushReqQueueSize; int numTotalRetries; int numPendingRetries; - std::deque pushReqQueue; + std::deque pushReqQueue; // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; + std::unordered_map reqInfoMap; int onTheFlyMemReqs; int memRespQueueSize; 
std::deque memRespQueue; + std::deque> edgeQueue; template PacketPtr createUpdatePacket(Addr addr, T value); @@ -167,6 +179,11 @@ class PushEngine : public BaseMemoryEngine int getNumRetries() { return numTotalRetries; } + void start(); // CoalesceEngine announcing work + void stop(); // CoalesceEngine announcing no work + bool running() { return _running; } + void recvWLItem2(Addr addr, WorkListItem wl); + }; } From 371f2b600c6b24ad2bdcb3f434284c06b22cff04 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 12 Aug 2022 08:32:42 -0700 Subject: [PATCH 143/247] Implemented pullVertex. --- configs/accl/sega.py | 7 +- src/accl/graph/base/data_structs.hh | 5 +- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/base_memory_engine.cc | 8 +- src/accl/graph/sega/coalesce_engine.cc | 71 +++--- src/accl/graph/sega/coalesce_engine.hh | 6 +- src/accl/graph/sega/push_engine.cc | 257 +++++++++------------- src/accl/graph/sega/push_engine.hh | 52 ++--- 8 files changed, 167 insertions(+), 240 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 26488ef69d..e7a704d477 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -6,10 +6,9 @@ from m5.util.convert import toMemorySize class MPU(SubSystem): - def __init__(self, base_edge_addr): + def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0, - push_req_queue_size=32, + self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, @@ -151,7 +150,7 @@ def __init__(self, mpus = [] for i in range(num_mpus): - mpus.append(MPU(base_edge_addr=self.mem_ctrl.getEdgeBaseAddr(i))) + mpus.append(MPU()) mpus[i].setReqPort(self.interconnect.cpu_side_ports) mpus[i].setRespPort(self.interconnect.mem_side_ports) mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 
6f775d8a38..026a3cb7b2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -94,9 +94,10 @@ struct CompleteEdge { uint64_t src; uint64_t dst; uint32_t weight; + uint32_t value; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight): - src(src), dst(dst), weight(weight) + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) {} std::string to_string() diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 4c398b5ccd..ae216ccdd4 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -47,6 +47,7 @@ DebugFlag('CacheBlockState') DebugFlag('CoalesceEngine') DebugFlag('PushEngine') DebugFlag('SEGAStructureSize') +DebugFlag('TempFlag') DebugFlag('WLEngine') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index a5d1d7e8e7..9bd1941b23 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -99,11 +99,9 @@ BaseMemoryEngine::MemPort::recvReqRetry() "Received retry without a blockedPacket"); _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); } PacketPtr diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 21dd746aad..dcec2a5f78 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -47,8 +47,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - numRetriesReceived(0), + 
numMSHREntries(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -423,26 +424,20 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "for addr %lu. It was not found in the cache.\n", __func__, addr); WorkListItem* items = pkt->getPtr(); - int push_needed = 0; // No applying of the line needed. DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (items[i].degree == 0))); + Addr vertex_addr = addr + i * sizeof(WorkListItem); if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry(items[i]); + _workCount--; + needsPush[it + i] = 0; + peerPushEngine->recvVertexPush(vertex_addr, items[i]); + break; } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - // } delete pkt; return true; } @@ -691,7 +686,7 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. 
" "cacheBlock[%d] to be applied.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, cacheBlocks[block_index].to_string()); + __func__, block_index, cacheBlocks[block_index].to_string()); assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); @@ -712,14 +707,15 @@ CoalesceEngine::processNextApplyEvent() int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if ((needsPush[bit_index_base + index] == 0) && - (cacheBlocks[block_index].items[index].degree != 0)) { - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[index]); - } else { + + if (cacheBlocks[block_index].items[index].degree > 0) { + if (needsPush[bit_index_base + index] == 0) { + _workCount++; needsPush[bit_index_base + index] = 1; } + if (!peerPushEngine->running()) { + peerPushEngine->start(); + } } } } @@ -945,24 +941,20 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - int push_needed = 0; DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { + Addr vertex_addr = addr + i * sizeof(WorkListItem); if (needsPush[slice_base + i] == 1) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i]); + _workCount--; + needsPush[slice_base + i] = 0; + peerPushEngine->recvVertexPush(vertex_addr, + cacheBlocks[block_index].items[i]); + break; } - push_needed += needsPush[slice_base + i]; - needsPush[slice_base + i] = 0; } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); } else { PacketPtr pkt 
= createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -973,11 +965,10 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) // a flag to true (maybe not even needed just look if the cache has a // line allocated for it in the cacheBlocks). } - numRetriesReceived--; - assert(numRetriesReceived == 0); + numPullsReceived--; } - if (numRetriesReceived > 0) { + if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { processNextPushRetry(slice_base, schedule_tick); @@ -1002,29 +993,19 @@ CoalesceEngine::recvMemRetry() } void -CoalesceEngine::recvPushRetry() +CoalesceEngine::recvVertexPull() { - numRetriesReceived++; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - // For now since we do only one retry at a time, we should not receive - // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(numRetriesReceived == 1); - - // TODO: Pass slice_base to getOptimalBitVectorSlice + numPullsReceived++; memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { processNextPushRetry(slice_base, schedule_tick); }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " - "memoryFunctionQueue.\n", __func__); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } } - - CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ce6e0daca6..6969fe2823 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,7 +106,8 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map> MSHR; std::deque> responseQueue; - int numRetriesReceived; + int _workCount; + int numPullsReceived; UniqueFIFO 
applyQueue; std::bitset needsPush; @@ -161,7 +162,8 @@ class CoalesceEngine : public BaseMemoryEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); - void recvPushRetry(); + int workCount() { return _workCount; } + void recvVertexPull(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d071e8fd37..b5341b3d61 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/PushEngine.hh" +#include "debug/TempFlag.hh" #include "mem/packet_access.hh" namespace gem5 @@ -38,13 +39,12 @@ namespace gem5 PushEngine::PushEngine(const Params ¶ms): BaseMemoryEngine(params), reqPort(name() + ".req_port", this), - pushReqQueueSize(params.push_req_queue_size), - numTotalRetries(0), numPendingRetries(0), - onTheFlyMemReqs(0), - memRespQueueSize(params.resp_queue_size), + _running(false), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), - nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) {} @@ -66,15 +66,31 @@ PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, numElementsPerLine = elements_per_line; } +void +PushEngine::recvReqRetry() +{ + DPRINTF(PushEngine, "%s: Received a req retry.\n", __func__); + if (nextPushEvent.pending()) { + nextPushEvent.wake(); + schedule(nextPushEvent, nextCycle()); + } +} + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
+ DPRINTF(PushEngine, "%s: Sending pakcet: %s to " + "the network.\n", __func__, pkt->print()); if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; + DPRINTF(PushEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(PushEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvReqRetry(); } } @@ -92,86 +108,73 @@ PushEngine::ReqPort::recvReqRetry() DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); _blocked = false; - sendPacket(blockedPacket); - - if (!_blocked) { - blockedPacket = nullptr; - DPRINTF(PushEngine, "%s: Sent the blockedPacket. " - "_blocked: %s, (blockedPacket == nullptr): %s.\n", - __func__, _blocked ? "true" : "false", - (blockedPacket == nullptr) ? "true" : "false"); - } + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +bool +PushEngine::vertexSpace() +{ + return (edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize); +} + +bool +PushEngine::workLeft() +{ + return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); } void -PushEngine::deallocatePushSpace(int space) +PushEngine::start() { - /// DISCUSS: Might have to check whether the addrGenEvent is scheduled - // and or the pushReqQueue is empty. If so we might need to - // send retries. 
- DPRINTF(PushEngine, "%s: Received reported %d free spaces.\n", - __func__, space); - numPendingRetries--; - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " - "free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - assert(!nextSendRetryEvent.scheduled()); - schedule(nextSendRetryEvent, nextCycle()); - } + assert(!_running); + assert(!nextVertexPullEvent.scheduled()); + + _running = true; + // NOTE: We might have to check for size availability here. + assert(workLeft()); + if (vertexSpace()) { + schedule(nextVertexPullEvent, nextCycle()); } } void -PushEngine::recvWLItem(WorkListItem wl) +PushEngine::processNextVertexPullEvent() { - assert(wl.degree != 0); - - assert((pushReqQueueSize == 0) || - (pushReqQueue.size() < pushReqQueueSize)); - panic_if((pushReqQueue.size() == pushReqQueueSize) && - (pushReqQueueSize != 0), "You should call this method after " - "checking if there is enough push space. 
Use allocatePushSpace.\n"); + // TODO: change edgePointerQueueSize + numPendingPulls++; + peerCoalesceEngine->recvVertexPull(); - DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); - Addr start_addr = wl.edgeIndex * sizeof(Edge); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value, 0); - DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", - __func__, pushReqQueue.size()); + if (!workLeft()) { + _running = false; + } - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())) { - schedule(nextMemoryReadEvent, nextCycle()); + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); } } void -PushEngine::recvWLItemRetry(WorkListItem wl) +PushEngine::recvVertexPush(Addr addr, WorkListItem wl) { - assert(wl.degree != 0); - DPRINTF(PushEngine, "%s: Received %s with retry.\n", - __func__, wl.to_string()); + assert(wl.degree > 0); + assert((edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value, 0); - assert(pushReqQueue.size() <= pushReqQueueSize); - DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", - __func__, pushReqQueue.size()); + edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, (uint32_t) wl.prop); + numPendingPulls--; + DPRINTF(TempFlag, "%s: Received {addr: %lu, wl: %s}.\n", + __func__, addr, wl.to_string()); + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } - numTotalRetries--; if ((!nextMemoryReadEvent.pending()) && 
(!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); @@ -186,20 +189,17 @@ PushEngine::processNextMemoryReadEvent() return; } - if (memRespQueue.size() < (memRespQueueSize - onTheFlyMemReqs)) { + if (edgeQueue.size() < (edgeQueueSize - onTheFlyMemReqs)) { Addr aligned_addr, offset; int num_edges; - EdgeReadInfoGen &curr_info = pushReqQueue.front(); + EdgeReadInfoGen &curr_info = edgePointerQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - reqOffsetMap[pkt->req] = offset; - reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = curr_info.value(); PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; @@ -208,42 +208,23 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); - pushReqQueue.pop_front(); - DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " - "pushReqQueue.size() = %u.\n", - __func__, pushReqQueue.size()); - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - " free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } - } - } + edgePointerQueue.pop_front(); + DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" + "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); } } - if (!pushReqQueue.empty()) { + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if (!edgePointerQueue.empty()) { assert(!nextMemoryReadEvent.pending()); assert(!nextMemoryReadEvent.scheduled()); schedule(nextMemoryReadEvent, nextCycle()); } } -void -PushEngine::processNextSendRetryEvent() -{ - assert(numPendingRetries == 0); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); -} - void PushEngine::recvMemRetry() { @@ -259,25 +240,27 @@ PushEngine::handleMemResp(PacketPtr pkt) { // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - memRespQueue.push_back(pkt); - onTheFlyMemReqs--; - assert(memRespQueue.size() <= memRespQueueSize); uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::vector edges; + std::deque edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back(push_info.src, edge_dst, edge_weight); + edges.emplace_back(push_info.src, edge_dst, + edge_weight, push_info.value); } edgeQueue.push_back(edges); + onTheFlyMemReqs--; + reqInfoMap.erase(pkt->req); delete pkt_data; + delete pkt; - if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { + if ((!nextPushEvent.pending()) && + (!nextPushEvent.scheduled())) { schedule(nextPushEvent, nextCycle()); } return true; @@ -287,50 +270,37 @@ PushEngine::handleMemResp(PacketPtr pkt) void PushEngine::processNextPushEvent() { - PacketPtr pkt = memRespQueue.front(); - uint8_t* data = pkt->getPtr(); - - Addr offset = reqOffsetMap[pkt->req]; - assert(offset < peerMemoryAtomSize); - uint32_t value = reqValueMap[pkt->req]; + if 
(reqPort.blocked()) { + nextPushEvent.sleep(); + return; + } - DPRINTF(PushEngine, "%s: Looking at the front of the queue. pkt->Addr: %lu, " - "offset: %lu\n", - __func__, pkt->getAddr(), offset); + std::deque& edge_list = edgeQueue.front(); + CompleteEdge curr_edge = edge_list.front(); - Edge* curr_edge = (Edge*) (data + offset); + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, curr_edge.to_string()); - std::vector& current_edges = edgeQueue.front(); - while(!current_edges.empty()) { - CompleteEdge curr_edge = current_edges.back(); - DPRINTF(PushEngine, "%s: %s.\n", __func__, curr_edge.to_string()); - current_edges.pop_back(); - } // TODO: Implement propagate function here - uint32_t update_value = value + 1; + uint32_t update_value = curr_edge.value + 1; PacketPtr update = createUpdatePacket( - curr_edge->neighbor, update_value); - - if (!reqPort.blocked()) { - reqPort.sendPacket(update); - stats.numUpdates++; - DPRINTF(PushEngine, "%s: Sent a push update to addr: %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); - reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); - assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); - reqNumEdgeMap[pkt->req]--; - assert(reqNumEdgeMap[pkt->req] >= 0); - } + curr_edge.dst, update_value); + + reqPort.sendPacket(update); + stats.numUpdates++; + DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " + "with value: %d.\n", __func__, curr_edge.src, + curr_edge.dst, update_value); + - if (reqNumEdgeMap[pkt->req] == 0) { - reqOffsetMap.erase(pkt->req); - reqNumEdgeMap.erase(pkt->req); - reqValueMap.erase(pkt->req); - memRespQueue.pop_front(); - delete pkt; + edge_list.pop_front(); + if (edge_list.empty()) { + edgeQueue.pop_front(); } - if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + assert(!nextPushEvent.pending()); + assert(!nextPushEvent.scheduled()); + if (!edgeQueue.empty()) { schedule(nextPushEvent, nextCycle()); } } @@ -354,17 +324,6 @@ 
PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } -bool -PushEngine::allocatePushSpace() { - if ((pushReqQueueSize == 0) || - ((pushReqQueue.size() < pushReqQueueSize) && (numTotalRetries == 0))) { - return true; - } else { - numTotalRetries++; - return false; - } -} - PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7fb6c42579..c79b0de944 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -49,14 +49,14 @@ class PushEngine : public BaseMemoryEngine size_t _step; size_t _atom; - uint32_t _value; Addr _src; + uint32_t _value; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, uint32_t value, Addr src): + size_t atom, Addr src, uint32_t value): _start(start), _end(end), _step(step), - _atom(atom), _value(value), _src(src) + _atom(atom), _src(src), _value(value) {} std::tuple nextReadPacketInfo() @@ -109,38 +109,34 @@ class PushEngine : public BaseMemoryEngine virtual void recvReqRetry(); }; + ReqPort reqPort; + bool _running; int numElementsPerLine; CoalesceEngine* peerCoalesceEngine; - ReqPort reqPort; - - int pushReqQueueSize; - int numTotalRetries; - int numPendingRetries; - std::deque pushReqQueue; - - // TODO: Add size one size for all these maps - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; + int numPendingPulls; + int edgePointerQueueSize; + std::deque edgePointerQueue; std::unordered_map reqInfoMap; int onTheFlyMemReqs; - int memRespQueueSize; - std::deque memRespQueue; - std::deque> edgeQueue; + int edgeQueueSize; + std::deque> edgeQueue; template PacketPtr createUpdatePacket(Addr addr, T value); + EventFunctionWrapper nextVertexPullEvent; + void processNextVertexPullEvent(); + MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); - EventFunctionWrapper nextPushEvent; + 
MemoryEvent nextPushEvent; void processNextPushEvent(); - EventFunctionWrapper nextSendRetryEvent; - void processNextSendRetryEvent(); + bool vertexSpace(); + bool workLeft(); struct PushStats : public statistics::Group { @@ -166,24 +162,14 @@ class PushEngine : public BaseMemoryEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool allocatePushSpace(); - - void deallocatePushSpace(int space); - - void recvWLItem(WorkListItem wl); - - void recvWLItemRetry(WorkListItem wl); - void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); - int getNumRetries() { return numTotalRetries; } + void recvReqRetry(); - void start(); // CoalesceEngine announcing work - void stop(); // CoalesceEngine announcing no work + void start(); bool running() { return _running; } - void recvWLItem2(Addr addr, WorkListItem wl); - + void recvVertexPush(Addr addr, WorkListItem wl); }; } From 34d8bcef6633e9019c3fd4d3921044eb5bebedeb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 22 Aug 2022 11:51:06 -0700 Subject: [PATCH 144/247] Added sim exit functionality. 
WIP --- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 7 +++++++ src/accl/graph/sega/coalesce_engine.hh | 2 ++ src/accl/graph/sega/push_engine.cc | 11 +++++++++++ src/accl/graph/sega/push_engine.hh | 8 +++++--- src/accl/graph/sega/wl_engine.cc | 6 ++++++ src/accl/graph/sega/wl_engine.hh | 3 ++- 7 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 102800de92..1f325703bd 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -52,7 +52,7 @@ class CenteralController : public ClockedObject RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} - // virtual AddrRangeList getAddrRanges() const; + void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dcec2a5f78..57bc99013c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -85,6 +85,13 @@ CoalesceEngine::drain() return DrainState::Drained; } +bool +CoalesceEngine::done() +{ + return needsPush.none() && + memoryFunctionQueue.empty() && peerWLEngine->done(); +} + // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6969fe2823..b19a1bc461 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -164,6 +164,8 @@ class CoalesceEngine : public BaseMemoryEngine int workCount() { return _workCount; } void recvVertexPull(); + + bool done(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index b5341b3d61..9866c30f5c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -32,6 +32,7 @@ 
#include "debug/PushEngine.hh" #include "debug/TempFlag.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -126,6 +127,12 @@ PushEngine::workLeft() return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); } +bool +PushEngine::done() +{ + return edgeQueue.empty() && + edgePointerQueue.empty() && peerCoalesceEngine->done(); +} void PushEngine::start() { @@ -298,6 +305,10 @@ PushEngine::processNextPushEvent() edgeQueue.pop_front(); } + if (done()) { + exitSimLoopNow(name() + " is done."); + } + assert(!nextPushEvent.pending()); assert(!nextPushEvent.scheduled()); if (!edgeQueue.empty()) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c79b0de944..a42228f4c0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -126,6 +126,9 @@ class PushEngine : public BaseMemoryEngine template PacketPtr createUpdatePacket(Addr addr, T value); + bool vertexSpace(); + bool workLeft(); + EventFunctionWrapper nextVertexPullEvent; void processNextVertexPullEvent(); @@ -135,9 +138,6 @@ class PushEngine : public BaseMemoryEngine MemoryEvent nextPushEvent; void processNextPushEvent(); - bool vertexSpace(); - bool workLeft(); - struct PushStats : public statistics::Group { PushStats(PushEngine &push); @@ -170,6 +170,8 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); + + bool done(); }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 12f4548aa2..e999667ad1 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -121,6 +121,12 @@ WLEngine::getAddrRanges() const return coalesceEngine->getAddrRanges(); } +bool +WLEngine::done() +{ + return registerFile.empty() && updateQueue.empty(); +} + // TODO: Parameterize the number of pops WLEngine can do at a time. 
// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 5e8e5b25f3..1360d37132 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -80,7 +80,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; void recvFunctional(PacketPtr pkt); - AddrRangeList getAddrRanges() const; EventFunctionWrapper nextReadEvent; @@ -116,6 +115,8 @@ class WLEngine : public BaseReduceEngine void handleIncomingWL(Addr addr, WorkListItem wl); int getRegisterFileSize() { return registerFileSize; } + + bool done(); }; } From 72cdfa6b3a53b4aaf0447b6a2ff3d7877b68abf1 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 26 Aug 2022 09:54:35 -0700 Subject: [PATCH 145/247] Adding a DDR model to the accelerator --- configs/accl/sega.py | 45 +++++++++++++++++++++++++++++------------- src/base/statistics.hh | 2 +- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e7a704d477..28f9211045 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -2,6 +2,7 @@ import argparse from math import log +import math from m5.objects import * from m5.util.convert import toMemorySize @@ -18,7 +19,7 @@ def __init__(self): self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="8MiB", + cache_size="16MiB", num_mshr_entry=32, num_tgts_per_mshr=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, @@ -61,7 +62,7 @@ def __init__(self, self._edge_chunk_size = int(\ toMemorySize(edge_memory_size)/num_channels) - self._edge_ranges = [AddrRange(\ + self._edge_ranges = [AddrRange( start=toMemorySize(vertex_memory_size)+\ self._edge_chunk_size*i,\ size=self._edge_chunk_size)\ @@ -69,23 +70,39 @@ def __init__(self, vertex_mem_ctrl = [] edge_mem_ctrl = [] + # vertex_mem_ranges = self._vertex_ranges + + for i in 
range(num_channels): + # vertex_addr_range = vertex_mem_ranges[i] + vertex_interface = DDR4_2400_8x8() + vertex_interface.range = self._vertex_ranges[i] + ctrl = MemCtrl() + ctrl.dram = vertex_interface vertex_mem_ctrl.append( - SimpleMemory(range=self._vertex_ranges[i], - bandwidth="19.2GB/s", - latency="30ns") + ctrl ) + + edge_interface = DDR4_2400_8x8( + image_file = f"{graph_path}/edgelist_{i}", in_addr_map=False) + edge_interface.range = AddrRange(self._edge_chunk_size) + # start=toMemorySize(vertex_memory_size)+\ + # self._edge_chunk_size*i,\ + # size=self._edge_chunk_size) + # edge_addr_range = edge_mem_range[0] + # edge_interface.range = self._edge_chunk_size + edge_ctrl = MemCtrl() + edge_ctrl.dram = edge_interface edge_mem_ctrl.append( - # SimpleMemory(range=self._edge_ranges[i], - # bandwidth="4.8GB/s", - # latency="30ns", - # image_file=f"{graph_path}/edgelist_{i}") - SimpleMemory(range=AddrRange(self._edge_chunk_size), - bandwidth="4.8GB/s", - latency="30ns", - image_file=f"{graph_path}/edgelist_{i}", - in_addr_map=False) + edge_ctrl ) + # edge_mem_ctrl.append( + # SimpleMemory(range=AddrRange(self._edge_chunk_size), + # bandwidth="4.8GB/s", + # latency="30ns", + # image_file=f"{graph_path}/edgelist_{i}", + # in_addr_map=False) + # ) self.vertex_mem_ctrl = vertex_mem_ctrl self.edge_mem_ctrl = edge_mem_ctrl diff --git a/src/base/statistics.hh b/src/base/statistics.hh index 24cbf714f5..15aeff892e 100644 --- a/src/base/statistics.hh +++ b/src/base/statistics.hh @@ -1052,7 +1052,7 @@ class VectorBase : public DataWrapVec Proxy operator[](off_type index) { - assert (index < size()); + // assert (index < size()); return Proxy(this->self(), index); } }; From 6d0c4011086f1a9c644accc96943fd2026bba3d2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 28 Aug 2022 21:14:54 -0700 Subject: [PATCH 146/247] Completed sim exit. I think... 
--- configs/accl/sega.py | 184 ++++++------------ src/accl/graph/sega/CenteralController.py | 6 +- src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/MPU.py | 47 +++++ src/accl/graph/sega/PushEngine.py | 2 - src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/WLEngine.py | 3 - src/accl/graph/sega/centeral_controller.cc | 23 ++- src/accl/graph/sega/centeral_controller.hh | 13 +- src/accl/graph/sega/coalesce_engine.cc | 78 ++++---- src/accl/graph/sega/coalesce_engine.hh | 11 +- src/accl/graph/sega/mpu.cc | 206 +++++++++++++++++++++ src/accl/graph/sega/mpu.hh | 135 ++++++++++++++ src/accl/graph/sega/push_engine.cc | 73 ++------ src/accl/graph/sega/push_engine.hh | 38 +--- src/accl/graph/sega/wl_engine.cc | 133 ++++--------- src/accl/graph/sega/wl_engine.hh | 43 +---- 17 files changed, 573 insertions(+), 427 deletions(-) create mode 100644 src/accl/graph/sega/MPU.py create mode 100644 src/accl/graph/sega/mpu.cc create mode 100644 src/accl/graph/sega/mpu.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 28f9211045..a0bfb5ddce 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,112 +4,8 @@ from math import log import math from m5.objects import * -from m5.util.convert import toMemorySize -class MPU(SubSystem): - def __init__(self): - super(MPU, self).__init__() - self.push_engine = PushEngine(push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64) - # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - # push_req_queue_size=32, - # attached_memory_atom_size=64, - # resp_queue_size=64) - self.coalesce_engine = CoalesceEngine( - peer_push_engine=self.push_engine, - attached_memory_atom_size=32, - cache_size="16MiB", - num_mshr_entry=32, - num_tgts_per_mshr=16) - self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=64, - register_file_size=32) - - def getRespPort(self): - return self.wl_engine.resp_port - def setRespPort(self, port): - self.wl_engine.resp_port = 
port - - def getReqPort(self): - return self.push_engine.req_port - def setReqPort(self, port): - self.push_engine.req_port = port - - def getVertexMemPort(self): - return self.coalesce_engine.mem_port - def setVertexMemPort(self, port): - self.coalesce_engine.mem_port = port - - def getEdgeMemPort(self): - return self.push_engine.mem_port - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port - -class MPUMemory(SubSystem): - def __init__(self, - num_channels: int, - cache_line_size: int, - vertex_memory_size: str, - edge_memory_size: str, - graph_path: str): - super(MPUMemory, self).__init__() - - self._vertex_ranges = self._interleave_addresses( - AddrRange(start=0, size=vertex_memory_size),\ - num_channels,\ - cache_line_size) - - self._edge_chunk_size = int(\ - toMemorySize(edge_memory_size)/num_channels) - self._edge_ranges = [AddrRange( - start=toMemorySize(vertex_memory_size)+\ - self._edge_chunk_size*i,\ - size=self._edge_chunk_size)\ - for i in range(num_channels)] - - vertex_mem_ctrl = [] - edge_mem_ctrl = [] - # vertex_mem_ranges = self._vertex_ranges - - - for i in range(num_channels): - # vertex_addr_range = vertex_mem_ranges[i] - vertex_interface = DDR4_2400_8x8() - vertex_interface.range = self._vertex_ranges[i] - ctrl = MemCtrl() - ctrl.dram = vertex_interface - vertex_mem_ctrl.append( - ctrl - ) - - edge_interface = DDR4_2400_8x8( - image_file = f"{graph_path}/edgelist_{i}", in_addr_map=False) - edge_interface.range = AddrRange(self._edge_chunk_size) - # start=toMemorySize(vertex_memory_size)+\ - # self._edge_chunk_size*i,\ - # size=self._edge_chunk_size) - # edge_addr_range = edge_mem_range[0] - # edge_interface.range = self._edge_chunk_size - edge_ctrl = MemCtrl() - edge_ctrl.dram = edge_interface - edge_mem_ctrl.append( - edge_ctrl - ) - # edge_mem_ctrl.append( - # SimpleMemory(range=AddrRange(self._edge_chunk_size), - # bandwidth="4.8GB/s", - # latency="30ns", - # image_file=f"{graph_path}/edgelist_{i}", - # in_addr_map=False) - # 
) - self.vertex_mem_ctrl = vertex_mem_ctrl - self.edge_mem_ctrl = edge_mem_ctrl - - def _interleave_addresses(self, - plain_range, - num_channels, - cache_line_size): +def interleave_addresses(plain_range, num_channels, cache_line_size): intlv_low_bit = log(cache_line_size, 2) intlv_bits = log(num_channels, 2) ret = [] @@ -123,17 +19,48 @@ def _interleave_addresses(self, intlvMatch=i)) return ret - def getVertexPort(self, i): - return self.vertex_mem_ctrl[i].port - def setVertexPort(self, port, i): - self.vertex_mem_ctrl[i].port = port +class GPT(SubSystem): + def __init__(self, edge_memory_size: str): + super().__init__() + self.wl_engine = WLEngine(update_queue_size=64, + register_file_size=32) + self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, + cache_size="8MiB", + num_mshr_entry=32, + num_tgts_per_mshr=16) + self.push_engine = PushEngine(push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64) + self.vertex_mem_ctrl = SimpleMemory(latency="30ns", + latency_var="0ns", + bandwidth="19.2GiB/s") + self.edge_mem_ctrl = SimpleMemory(latency="30ns", + latency_var="0ns", + bandwidth="19.2GiB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU(wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine) - def getEdgeBaseAddr(self, i): - return self._edge_ranges[i].start - def getEdgePort(self, i): - return self.edge_mem_ctrl[i].port - def setEdgePort(self, port, i): - self.edge_mem_ctrl[i].port = port + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + 
self.edge_mem_ctrl.image_file = edge_image class SEGA(System): def __init__(self, @@ -158,21 +85,19 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mem_ctrl = MPUMemory( - num_mpus, - self.cache_line_size, - "2GiB", - "14GiB", - graph_path) + vertex_ranges = interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) - mpus = [] + gpts = [] for i in range(num_mpus): - mpus.append(MPU()) - mpus[i].setReqPort(self.interconnect.cpu_side_ports) - mpus[i].setRespPort(self.interconnect.mem_side_ports) - mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) - mpus[i].setEdgeMemPort(self.mem_ctrl.getEdgePort(i)) - self.mpu = mpus + gpt = GPT("8GiB") + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setReqPort(self.interconnect.cpu_side_ports) + gpt.setRespPort(self.interconnect.mem_side_ports) + gpts.append(gpt) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] def get_inputs(): argparser = argparse.ArgumentParser() @@ -197,5 +122,4 @@ def get_inputs(): m5.instantiate() exit_event = m5.simulate() - print(f"Exited simulation because {exit_event.getCause()}") - exit() + print(f"Exited simulation at tick {m5.curTick()} because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index bd2f6320a8..6f6b12ea2c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -36,7 +36,9 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") req_port = RequestPort("Port to send updates to the outside") - addr = Param.Addr("") - value = Param.Int(0, "") + mpu_vector = VectorParam.MPU("All mpus in the system.") + + addr = Param.Addr("The addr for the initial update") + value = Param.Int("The value for the initial update") image_file = Param.String("Path 
to the global memory image.") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 06c6f92750..14902ef352 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -34,9 +34,6 @@ class CoalesceEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") - cache_size = Param.MemorySize("Size of the internal SRAM array.") - num_mshr_entry = Param.Int("Number of MSHR entries.") num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..2d65be2949 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class MPU(SimObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + in_port = ResponsePort("Port to receive updates from outside") + out_port = RequestPort("Port to send updates to the outside") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index a45f5d6ead..f98f22ba9d 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,8 +34,6 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - req_port = RequestPort("Port to send updates to the outside") - push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index ae216ccdd4..42a8d84ad5 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -30,12 +30,14 @@ Import('*') 
SimObject('BaseMemoryEngine.py') SimObject('CenteralController.py') SimObject('CoalesceEngine.py') +SimObject("MPU.py") SimObject('PushEngine.py') SimObject('WLEngine.py') Source('base_memory_engine.cc') Source('centeral_controller.cc') Source('coalesce_engine.cc') +Source("mpu.cc") Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 98089328f4..52ca031260 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,9 +34,6 @@ class WLEngine(BaseReduceEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' - resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoalesceEngine(NULL, "The CoalesceEngine " - "this WLEngine is connected to.") update_queue_size = Param.Int("Size of the queue WLEngine stores " "the incoming updates") register_file_size = Param.Int("Number of internal registers the " diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index f19c93ebac..5ce7228abb 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,10 +28,13 @@ #include "accl/graph/sega/centeral_controller.hh" +#include + #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -43,7 +46,12 @@ CenteralController::CenteralController reqPort(name() + ".req_port", this), addr(params.addr), value(params.value) -{} +{ + for (auto mpu : params.mpu_vector) { + mpuVector.push_back(mpu); + mpu->registerCenteralController(this); + } +} Port& CenteralController::getPort(const std::string &if_name, PortID idx) @@ -143,4 +151,17 @@ CenteralController::functionalAccess(PacketPtr pkt) reqPort.sendFunctional(pkt); } +void +CenteralController::recvDoneSignal() +{ + bool done = true; + for 
(auto mpu : mpuVector) { + done &= mpu->done(); + } + + if (done) { + exitSimLoopNow("no update left to process."); + } +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 1f325703bd..c54c4c04ef 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -29,7 +29,10 @@ #ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ #define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#include + #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/mpu.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -67,20 +70,20 @@ class CenteralController : public ClockedObject Addr addr; uint32_t value; + std::vector mpuVector; template PacketPtr createUpdatePacket(Addr addr, T value); - - virtual void initState(); - virtual void startup(); - void functionalAccess(PacketPtr pkt); public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + virtual void initState(); + virtual void startup(); + + void recvDoneSignal(); }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 57bc99013c..d791926fe1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -30,7 +30,7 @@ #include -#include "accl/graph/sega/wl_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/BitVector.hh" @@ -38,16 +38,16 @@ #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), - peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) 
(peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), + onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { @@ -66,30 +66,20 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - - peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); - needsPush.reset(); } void -CoalesceEngine::registerWLEngine(WLEngine* wl_engine) +CoalesceEngine::registerMPU(MPU* mpu) { - peerWLEngine = wl_engine; -} - -DrainState -CoalesceEngine::drain() -{ - DPRINTF(CoalesceEngine, "%s: drain called.\n"); - return DrainState::Drained; + owner = mpu; } bool CoalesceEngine::done() { - return needsPush.none() && - memoryFunctionQueue.empty() && peerWLEngine->done(); + return applyQueue.empty() && needsPush.none() && + memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } // addr should be aligned to peerMemoryAtomSize @@ -153,17 +143,15 @@ CoalesceEngine::recvWLRead(Addr addr) responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // If they are scheduled for apply and WB those schedules should be @@ -418,6 +406,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } + onTheFlyReqs--; Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); @@ -439,7 +428,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (needsPush[it + i] == 1) { _workCount--; needsPush[it + i] = 0; - peerPushEngine->recvVertexPush(vertex_addr, items[i]); + owner->recvVertexPush(vertex_addr, items[i]); break; } } @@ -492,17 +481,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, miss_addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -548,18 +535,18 @@ CoalesceEngine::processNextResponseEvent() WorkListItem worklist_response; std::tie(addr_response, worklist_response) = responseQueue.front(); - peerWLEngine->handleIncomingWL(addr_response, worklist_response); + owner->handleIncomingWL(addr_response, worklist_response); DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { @@ -720,8 +707,8 @@ CoalesceEngine::processNextApplyEvent() _workCount++; needsPush[bit_index_base + index] = 1; } - if (!peerPushEngine->running()) { - peerPushEngine->start(); + if (!owner->running()) { + owner->start(); } } } @@ -760,6 +747,10 @@ CoalesceEngine::processNextApplyEvent() (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } + + if (done()) { + owner->recvDoneSignal(); + } } void @@ -816,6 +807,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + onTheFlyReqs++; } void @@ -845,6 +837,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + // onTheFlyReqs++; 
cacheBlocks[block_index].needsWB = false; cacheBlocks[block_index].pendingWB = false; @@ -955,7 +948,7 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) if (needsPush[slice_base + i] == 1) { _workCount--; needsPush[slice_base + i] = 0; - peerPushEngine->recvVertexPush(vertex_addr, + owner->recvVertexPush(vertex_addr, cacheBlocks[block_index].items[i]); break; } @@ -967,6 +960,7 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); + onTheFlyReqs++; // TODO: Set a tracking structure so that nextMemoryReadEvent knows // It does not have to read this address anymore. It can simply set // a flag to true (maybe not even needed just look if the cache has a diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b19a1bc461..03b463e570 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -33,7 +33,6 @@ #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" -#include "accl/graph/sega/push_engine.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -43,7 +42,7 @@ namespace gem5 { -class WLEngine; +class MPU; class CoalesceEngine : public BaseMemoryEngine { @@ -93,14 +92,13 @@ class CoalesceEngine : public BaseMemoryEngine bool isRetry; SenderState(bool is_retry): isRetry(is_retry) {} }; - - WLEngine* peerWLEngine; - PushEngine* peerPushEngine; + MPU* owner; int numLines; int numElementsPerLine; Block* cacheBlocks; + int onTheFlyReqs; int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; @@ -156,11 +154,10 @@ class CoalesceEngine : public BaseMemoryEngine public: PARAMS(CoalesceEngine); CoalesceEngine(const Params ¶ms); - virtual DrainState drain() override; + void registerMPU(MPU* mpu); bool recvWLRead(Addr addr); void 
recvWLWrite(Addr addr, WorkListItem wl); - void registerWLEngine(WLEngine* wl_engine); int workCount() { return _workCount; } void recvVertexPull(); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..7b1727587a --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +#include "accl/graph/sega/centeral_controller.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + SimObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine), + inPort(name() + ".inPort", this), + outPort(name() + ".outPort", this) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +Port& +MPU::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_port") { + return inPort; + } else if (if_name == "out_port") { + return outPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +MPU::init() +{ + localAddrRange = getAddrRanges(); + inPort.sendRangeChange(); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +AddrRangeList +MPU::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void +MPU::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + sendRetryReq(); + needSendRetryReq = false; + } +} + +bool +MPU::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +MPU::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +MPU::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +MPU::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +MPU::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + } else { + owner->recvReqRetry(); + } +} + +bool +MPU::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +MPU::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvVertexPush(Addr addr, WorkListItem wl) +{ + pushEngine->recvVertexPush(addr, wl); +} + +void +MPU::sendPacket(PacketPtr pkt) +{ + bool found_locally = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(pkt->getAddr()); + } + + if (found_locally) { + // TODO: count number of local updates + + } else { + // TOOD: count number of remote updates + + } + + outPort.sendPacket(pkt); +} + +void +MPU::recvDoneSignal() +{ + centeralController->recvDoneSignal(); +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..edf0350caf --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "sim/sim_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public SimObject +{ + private: + class RespPort : public ResponsePort + { + private: + MPU* owner; + bool needSendRetryReq; + + public: + RespPort(const std::string& name, MPU* owner): + ResponsePort(name, owner), owner(owner), needSendRetryReq(false) + {} + virtual AddrRangeList getAddrRanges() const; + + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class ReqPort : public RequestPort + { + private: + MPU* owner; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, MPU* owner) : + RequestPort(name, owner), owner(owner), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + + RespPort inPort; + ReqPort outPort; + + AddrRangeList localAddrRange; + + public: + PARAMS(MPU); + MPU(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerCenteralController(CenteralController* centeral_controller); + + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + void recvFunctional(PacketPtr pkt) { 
coalesceEngine->recvFunctional(pkt); } + + bool handleIncomingUpdate(PacketPtr pkt); + void checkRetryReq() { inPort.checkRetryReq(); } + void handleIncomingWL(Addr addr, WorkListItem wl); + bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, WorkListItem wl); + + bool blocked() { return outPort.blocked(); } + void sendPacket(PacketPtr pkt); + void recvReqRetry() { pushEngine->recvReqRetry(); } + + void recvDoneSignal(); + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9866c30f5c..0134133cfa 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" #include "debug/TempFlag.hh" #include "mem/packet_access.hh" @@ -37,9 +38,8 @@ namespace gem5 { -PushEngine::PushEngine(const Params ¶ms): +PushEngine::PushEngine(const Params& params): BaseMemoryEngine(params), - reqPort(name() + ".req_port", this), _running(false), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), @@ -49,22 +49,10 @@ PushEngine::PushEngine(const Params ¶ms): stats(*this) {} -Port& -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return BaseMemoryEngine::getPort(if_name, idx); - } -} - void -PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, - int elements_per_line) 
+PushEngine::registerMPU(MPU* mpu) { - peerCoalesceEngine = coalesce_engine; - numElementsPerLine = elements_per_line; + owner = mpu; } void @@ -77,43 +65,6 @@ PushEngine::recvReqRetry() } } -void -PushEngine::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - DPRINTF(PushEngine, "%s: Sending pakcet: %s to " - "the network.\n", __func__, pkt->print()); - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - DPRINTF(PushEngine, "%s: MemPort blocked.\n", __func__); - } else { - DPRINTF(PushEngine, "%s: Packet sent successfully.\n", __func__); - owner->recvReqRetry(); - } -} - -bool -PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); - - _blocked = false; - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); -} - bool PushEngine::vertexSpace() { @@ -124,15 +75,17 @@ PushEngine::vertexSpace() bool PushEngine::workLeft() { - return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); + return ((owner->workCount() - numPendingPulls) > 0); } bool PushEngine::done() { return edgeQueue.empty() && - edgePointerQueue.empty() && peerCoalesceEngine->done(); + (onTheFlyMemReqs == 0) && + edgePointerQueue.empty(); } + void PushEngine::start() { @@ -152,7 +105,7 @@ PushEngine::processNextVertexPullEvent() { // TODO: change edgePointerQueueSize numPendingPulls++; - peerCoalesceEngine->recvVertexPull(); + owner->recvVertexPull(); if (!workLeft()) { _running = false; @@ -277,7 +230,7 @@ PushEngine::handleMemResp(PacketPtr pkt) void PushEngine::processNextPushEvent() { - if (reqPort.blocked()) { + if (owner->blocked()) { nextPushEvent.sleep(); return; } @@ 
-293,7 +246,7 @@ PushEngine::processNextPushEvent() PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); - reqPort.sendPacket(update); + owner->sendPacket(update); stats.numUpdates++; DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " "with value: %d.\n", __func__, curr_edge.src, @@ -305,10 +258,6 @@ PushEngine::processNextPushEvent() edgeQueue.pop_front(); } - if (done()) { - exitSimLoopNow(name() + " is done."); - } - assert(!nextPushEvent.pending()); assert(!nextPushEvent.scheduled()); if (!edgeQueue.empty()) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a42228f4c0..6f92b62be0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -38,6 +38,7 @@ namespace gem5 { class CoalesceEngine; +class MPU; class PushEngine : public BaseMemoryEngine { @@ -89,31 +90,9 @@ class PushEngine : public BaseMemoryEngine int numElements; }; - class ReqPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, PushEngine* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - ReqPort reqPort; - bool _running; int numElementsPerLine; - CoalesceEngine* peerCoalesceEngine; + MPU* owner; int numPendingPulls; int edgePointerQueueSize; @@ -157,20 +136,15 @@ class PushEngine : public BaseMemoryEngine public: PARAMS(PushEngine); - PushEngine(const Params ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - void registerCoalesceEngine(CoalesceEngine* coalesce_engine, - int elements_per_line); - - void recvReqRetry(); + PushEngine(const Params& params); + void registerMPU(MPU* mpu); void start(); bool running() { return 
_running; } void recvVertexPush(Addr addr, WorkListItem wl); + void recvReqRetry(); + bool done(); }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e999667ad1..9890eeed76 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,103 +28,61 @@ #include "accl/graph/sega/wl_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { -WLEngine::WLEngine(const WLEngineParams ¶ms): +WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), - respPort(name() + ".resp_port", this), - coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) -{ - coalesceEngine->registerWLEngine(this); -} - -Port& -WLEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "resp_port") { - return respPort; - } else { - return BaseReduceEngine::getPort(if_name, idx); - } -} +{} void -WLEngine::init() +WLEngine::registerMPU(MPU* mpu) { - respPort.sendRangeChange(); + owner = mpu; } -AddrRangeList -WLEngine::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -void -WLEngine::RespPort::checkRetryReq() +bool +WLEngine::done() { - if (needSendRetryReq) { - DPRINTF(WLEngine, "%s: Sending a RetryReq.\n", __func__); - sendRetryReq(); - needSendRetryReq = false; - } + return registerFile.empty() && updateQueue.empty(); } bool -WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +WLEngine::handleIncomingUpdate(PacketPtr pkt) { - if (!owner->handleIncomingUpdate(pkt)) { - needSendRetryReq = true; + assert(updateQueue.size() <= updateQueueSize); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } - return 
true; -} - -Tick -WLEngine::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -WLEngine::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - coalesceEngine->recvFunctional(pkt); -} + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); -AddrRangeList -WLEngine::getAddrRanges() const -{ - return coalesceEngine->getAddrRanges(); -} + // delete the packet since it's not needed anymore. + delete pkt; -bool -WLEngine::done() -{ - return registerFile.empty() && updateQueue.empty(); + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; } // TODO: Parameterize the number of pops WLEngine can do at a time. @@ -150,7 +108,7 @@ WLEngine::processNextReadEvent() // return a boolean value. It should return an integer/enum // to tell WLEngine why it rejected the read request. Their might // be things that WLEngine can do to fix head of the line blocking. - if (coalesceEngine->recvWLRead(update_addr)) { + if (owner->recvWLRead(update_addr)) { DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " "request to addr: %lu.\n", __func__, update_addr); registerFile[update_addr] = update_value; @@ -171,7 +129,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - respPort.checkRetryReq(); + owner->checkRetryReq(); } } } else { @@ -194,7 +152,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - respPort.checkRetryReq(); + owner->checkRetryReq(); } if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { @@ -238,7 +196,7 @@ WLEngine::processNextReduceEvent() __func__, addr, workListFile[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, workListFile[addr]); + owner->recvWLWrite(addr, workListFile[addr]); registerFile.erase(addr); DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " "registerFile.size = %d, registerFileSize = %d\n", @@ -248,40 +206,15 @@ WLEngine::processNextReduceEvent() __func__, addr, registerFile.size(), registerFileSize); } workListFile.clear(); -} -bool -WLEngine::handleIncomingUpdate(PacketPtr pkt) -{ - assert(updateQueue.size() <= updateQueueSize); - if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { - return false; + if (done()) { + owner->recvDoneSignal(); } - - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - - - // delete the packet since it's not needed anymore. 
- delete pkt; - - if (!nextReadEvent.scheduled()) { - schedule(nextReadEvent, nextCycle()); - } - return true; } WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) : statistics::Group(&_wl), wl(_wl), - ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1360d37132..4a0489b123 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,42 +34,18 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/base/data_structs.hh" -#include "accl/graph/sega/coalesce_engine.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" namespace gem5 { +class MPU; + class WLEngine : public BaseReduceEngine { private: - class RespPort : public ResponsePort - { - private: - WLEngine* owner; - bool needSendRetryReq; - - public: - RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner), needSendRetryReq(false) - {} - virtual AddrRangeList getAddrRanges() const; - - void checkRetryReq(); - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - virtual void init(); - - RespPort respPort; - - CoalesceEngine* coalesceEngine; + MPU* owner; int updateQueueSize; std::deque> updateQueue; @@ -79,9 +55,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; - void recvFunctional(PacketPtr pkt); - AddrRangeList getAddrRanges() const; - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); @@ -104,18 +77,12 @@ class WLEngine : public BaseReduceEngine public: PARAMS(WLEngine); - - WLEngine(const WLEngineParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; + WLEngine(const Params& params); + void 
registerMPU(MPU* mpu); bool handleIncomingUpdate(PacketPtr pkt); - void handleIncomingWL(Addr addr, WorkListItem wl); - int getRegisterFileSize() { return registerFileSize; } - bool done(); }; From 86b82a7286a47a66c9df0b75ef6501d56cefaea3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:24:27 -0700 Subject: [PATCH 147/247] Minor improvements in the code. --- src/accl/graph/sega/coalesce_engine.cc | 60 ++++++++------------------ src/accl/graph/sega/coalesce_engine.hh | 7 ++- 2 files changed, 22 insertions(+), 45 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d791926fe1..ba7878be7a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -140,8 +140,9 @@ CoalesceEngine::recvWLRead(Addr addr) // TODO: Add a hit latency as a param for this object. // Can't just schedule the nextResponseEvent for latency cycles in // the future. - responseQueue.push_back(std::make_tuple(addr, - cacheBlocks[block_index].items[wl_offset])); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset])); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. 
responseQueue.size = %d.\n", __func__, addr, @@ -434,6 +435,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + + pendingVertexPullReads.erase(addr); delete pkt; return true; } @@ -466,12 +469,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) delete pkt; } - // FIXME: Get rid of servicedIndices (maybe use an iterator) - std::vector servicedIndices; - for (int i = 0; i < MSHR[block_index].size(); i++) { - Addr miss_addr = MSHR[block_index][i]; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " @@ -495,28 +497,14 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - // End of the said block - servicedIndices.push_back(i); - // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " - // "removal.\n", __func__, i, block_index); + it = MSHR[block_index].erase(it); + } else { + it++; } } - // TODO: We Can use taken instead of this - // TODO: Change the MSHR from map to map - int bias = 0; - for (int i = 0; i < servicedIndices.size(); i++) { - Addr print_addr = MSHR[block_index][i - bias]; - MSHR[block_index].erase(MSHR[block_index].begin() + - servicedIndices[i] - bias); - bias++; - DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced " - "and is removed.\n", __func__, print_addr); - } - if (MSHR[block_index].empty()) { MSHR.erase(block_index); - // cacheBlocks[block_index].hasConflict = false; } if ((!nextResponseEvent.scheduled()) && @@ -902,24 +890,8 @@ CoalesceEngine::getOptimalBitVectorSlice() 
(!cacheBlocks[block_index].pendingWB)) { assert(!cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); - // current_score += numElementsPerLine * 2; - // if (current_score > score) { - // score = current_score; - // slice_base = it; - // hit_in_cache = true; - // if (score == max_score_possible) { - // break; - // } - // } return std::make_tuple(true, it); } else if (cacheBlocks[block_index].addr != addr) { - // score += numElementsPerLine; - // if (current_score > score) { - // score = current_score; - // slice_base = it; - // hit_in_cache = false; - // assert(score < max_score_possible); - // } return std::make_tuple(false, it); } } @@ -928,7 +900,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) +CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; @@ -961,6 +933,8 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); onTheFlyReqs++; + + pendingVertexPullReads.insert(addr); // TODO: Set a tracking structure so that nextMemoryReadEvent knows // It does not have to read this address anymore. 
It can simply set // a flag to true (maybe not even needed just look if the cache has a @@ -972,9 +946,9 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { - processNextPushRetry(slice_base, schedule_tick); + processNextVertexPull(slice_base, schedule_tick); }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " + DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " "0 to memoryFunctionQueue.\n", __func__); } } @@ -999,7 +973,7 @@ CoalesceEngine::recvVertexPull() numPullsReceived++; memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { - processNextPushRetry(slice_base, schedule_tick); + processNextVertexPull(slice_base, schedule_tick); }, 0, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 03b463e570..75c36f9c03 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -114,12 +114,15 @@ class CoalesceEngine : public BaseMemoryEngine Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); + std::unordered_set pendingVertexPullReads; + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextPushRetry(int slice_base, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; + void processNextVertexPull(int slice_base, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); From 8bbe1cd51f5d04ddb366519316e4427840c69943 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 1 Sep 2022 21:00:19 -0700 Subject: [PATCH 
148/247] Added HBM as vertex memory. It doesn't exit! --- configs/accl/sega.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a0bfb5ddce..2c44c1f7eb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,20 +20,26 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size: str): + def __init__(self, edge_memory_size, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, - cache_size="8MiB", + cache_size=cache_size, num_mshr_entry=32, num_tgts_per_mshr=16) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - self.vertex_mem_ctrl = SimpleMemory(latency="30ns", - latency_var="0ns", - bandwidth="19.2GiB/s") + + vertex_interface = HBM_1000_4H_1x128() + # vertex_interface.range = self._vertex_ranges[i] + ctrl = MemCtrl() + ctrl.dram = vertex_interface + self.vertex_mem_ctrl = ctrl + # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", + # latency_var="0ns", + # bandwidth="19.2GiB/s") self.edge_mem_ctrl = SimpleMemory(latency="30ns", latency_var="0ns", bandwidth="19.2GiB/s", @@ -58,7 +64,8 @@ def setReqPort(self, port): self.mpu.out_port = port def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range + # self.vertex_mem_ctrl.range = vertex_range + self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): self.edge_mem_ctrl.image_file = edge_image @@ -66,6 +73,7 @@ class SEGA(System): def __init__(self, num_mpus, vertex_cache_line_size, + cache_size, graph_path, first_addr, first_value): @@ -85,11 +93,15 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - vertex_ranges = 
interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) + # vertex_ranges = interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"),\ + num_mpus,\ + vertex_cache_line_size) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB") + gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) @@ -103,19 +115,21 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) argparser.add_argument("vertex_cache_line_size", type=int) + argparser.add_argument("cache_size", type=str) argparser.add_argument("graph_path", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - return args.num_mpus, args.vertex_cache_line_size, \ + print("******* ", args.cache_size) + return args.num_mpus, args.vertex_cache_line_size, args.cache_size, \ args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, \ + num_mpus, vertex_cache_line_size, cache_size, \ graph_path, first_addr, first_value = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, \ + system = SEGA(num_mpus, vertex_cache_line_size, cache_size, \ graph_path, first_addr, first_value) root = Root(full_system = False, system = system) From 25ded8a0636ea641d9da9a8cbe913f91e9f0c08b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 1 Sep 2022 21:24:19 -0700 Subject: [PATCH 149/247] Adding Real memory for EM --- configs/accl/sega.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2c44c1f7eb..e9286deafc 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ 
-20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str): + def __init__(self, edge_memory_size, cache_size: str, i): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) @@ -40,11 +40,13 @@ def __init__(self, edge_memory_size, cache_size: str): # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", # latency_var="0ns", # bandwidth="19.2GiB/s") - self.edge_mem_ctrl = SimpleMemory(latency="30ns", - latency_var="0ns", - bandwidth="19.2GiB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False) + edge_interface = DDR4_2400_8x8( + device_size = edge_memory_size, + image_file = f"{graph_path}/edgelist_{i}", + in_addr_map=False) + edge_ctrl = MemCtrl() + edge_ctrl.dram = edge_interface + self.edge_mem_ctrl = edge_ctrl self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -67,7 +69,7 @@ def set_vertex_range(self, vertex_range): # self.vertex_mem_ctrl.range = vertex_range self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image + self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): def __init__(self, @@ -101,7 +103,7 @@ def __init__(self, gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) + gpt = GPT("8GiB", cache_size, i) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) From 0f69be29a97f915680b809fb3febc19543c60c99 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:38:00 -0700 Subject: [PATCH 150/247] Fixing style. 
--- configs/accl/sega.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e9286deafc..1e360676cb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str, i): + def __init__(self, edge_memory_size, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) @@ -31,18 +31,14 @@ def __init__(self, edge_memory_size, cache_size: str, i): self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - - vertex_interface = HBM_1000_4H_1x128() - # vertex_interface.range = self._vertex_ranges[i] + + vertex_interface = HBM_1000_4H_1x128(burst_length=2) ctrl = MemCtrl() ctrl.dram = vertex_interface self.vertex_mem_ctrl = ctrl - # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", - # latency_var="0ns", - # bandwidth="19.2GiB/s") + edge_interface = DDR4_2400_8x8( - device_size = edge_memory_size, - image_file = f"{graph_path}/edgelist_{i}", + device_size = edge_memory_size, in_addr_map=False) edge_ctrl = MemCtrl() edge_ctrl.dram = edge_interface @@ -74,7 +70,6 @@ def set_edge_image(self, edge_image): class SEGA(System): def __init__(self, num_mpus, - vertex_cache_line_size, cache_size, graph_path, first_addr, @@ -83,7 +78,7 @@ def __init__(self, self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = vertex_cache_line_size + self.cache_line_size = 32 self.mem_mode = "timing" self.interconnect = NoncoherentXBar(frontend_latency=1, @@ -95,15 +90,14 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - # vertex_ranges = interleave_addresses(AddrRange("4GiB"), 
num_mpus, vertex_cache_line_size) vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"),\ - num_mpus,\ - vertex_cache_line_size) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size, i) + gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) @@ -116,23 +110,20 @@ def __init__(self, def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) - argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph_path", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - print("******* ", args.cache_size) - return args.num_mpus, args.vertex_cache_line_size, args.cache_size, \ + + return args.num_mpus, args.cache_size, \ args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, cache_size, \ - graph_path, first_addr, first_value = get_inputs() + num_mpus, cache_size, graph_path, first_addr, first_value = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, cache_size, \ - graph_path, first_addr, first_value) + system = SEGA(num_mpus, cache_size, graph_path, first_addr, first_value) root = Root(full_system = False, system = system) m5.instantiate() From 16bb60f064fadacb1a8cb62eaf6bc0d0a6aacffd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:44:37 -0700 Subject: [PATCH 151/247] Khoshgelation. 
--- configs/accl/sega.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 1e360676cb..b023507a39 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -22,27 +22,21 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__(self, edge_memory_size, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, + self.wl_engine = WLEngine(update_queue_size=32, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, - num_tgts_per_mshr=16) + num_tgts_per_mshr=32) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - vertex_interface = HBM_1000_4H_1x128(burst_length=2) - ctrl = MemCtrl() - ctrl.dram = vertex_interface - self.vertex_mem_ctrl = ctrl + self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) - edge_interface = DDR4_2400_8x8( - device_size = edge_memory_size, - in_addr_map=False) - edge_ctrl = MemCtrl() - edge_ctrl.dram = edge_interface - self.edge_mem_ctrl = edge_ctrl + self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False)) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -62,7 +56,6 @@ def setReqPort(self, port): self.mpu.out_port = port def set_vertex_range(self, vertex_range): - # self.vertex_mem_ctrl.range = vertex_range self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image From 99f997f387edb67177ee3789522db4d0f0f986be Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 2 Sep 2022 07:47:19 -0700 Subject: [PATCH 152/247] Adding new stats. 
--- configs/accl/sega.py | 3 +- src/accl/graph/sega/CoalesceEngine.py | 2 + src/accl/graph/sega/coalesce_engine.cc | 71 +++++++++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 8 +-- 4 files changed, 56 insertions(+), 28 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b023507a39..5cf557719f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -27,7 +27,8 @@ def __init__(self, edge_memory_size, cache_size: str): self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, - num_tgts_per_mshr=32) + num_tgts_per_mshr=32, + max_resp_per_cycle=4) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 14902ef352..2cc756ff3f 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,3 +37,5 @@ class CoalesceEngine(BaseMemoryEngine): cache_size = Param.MemorySize("Size of the internal SRAM array.") num_mshr_entry = Param.Int("Number of MSHR entries.") num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. 
Used to limit b/w.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ba7878be7a..1715d637f1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,6 +49,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -141,7 +142,7 @@ CoalesceEngine::recvWLRead(Addr addr) // Can't just schedule the nextResponseEvent for latency cycles in // the future. responseQueue.push_back(std::make_tuple( - addr, cacheBlocks[block_index].items[wl_offset])); + addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", @@ -197,6 +198,7 @@ CoalesceEngine::recvWLRead(Addr addr) "cacheBlocks[%d].\n", __func__, block_index); } MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, @@ -312,6 +314,7 @@ CoalesceEngine::recvWLRead(Addr addr) } // cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; @@ -344,6 +347,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); 
DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); memoryFunctionQueue.emplace_back( @@ -382,11 +386,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: There is room for another target " "for cacheBlocks[%d].\n", __func__, block_index); - // cacheBlocks[block_index].hasConflict = true; // TODO: Might want to differentiate between different misses. stats.readMisses++; MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " "cacheBlocks[%d].\n", __func__, addr, block_index); stats.numVertexReads++; @@ -481,7 +485,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "packet.\n",__func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset])); + cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, @@ -519,22 +523,36 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) void CoalesceEngine::processNextResponseEvent() { + int num_responses_sent = 0; + Addr addr_response; WorkListItem worklist_response; - - std::tie(addr_response, worklist_response) = responseQueue.front(); - owner->handleIncomingWL(addr_response, worklist_response); - DPRINTF(CoalesceEngine, - "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + if ((num_responses_sent >= maxRespPerCycle) || + (responseQueue.empty())) { + break; + } + } if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { @@ -694,9 +712,9 @@ CoalesceEngine::processNextApplyEvent() if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; - } - if (!owner->running()) { - owner->start(); + if (!owner->running()) { + owner->start(); + } } } } @@ -997,10 +1015,10 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), "Number of cache rejections."), - ADD_STAT(falseApplySchedules, statistics::units::Count::get(), - "Number of failed apply schedules."), - ADD_STAT(falseEvictSchedules, statistics::units::Count::get(), - "Number of failed evict schedules.") + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(mshrEntryLength, statistics::units::Count::get(), + "Histogram on the length of the mshr entries.") { } @@ -1008,6 +1026,11 @@ void CoalesceEngine::CoalesceStats::regStats() { using 
namespace statistics; + + mshrEntryLength.init(64); + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 75c36f9c03..641ed327bb 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -102,7 +102,8 @@ class CoalesceEngine : public BaseMemoryEngine int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; - std::deque> responseQueue; + int maxRespPerCycle; + std::deque> responseQueue; int _workCount; int numPullsReceived; @@ -144,8 +145,9 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; - statistics::Scalar falseApplySchedules; - statistics::Scalar falseEvictSchedules; + + statistics::Formula hitRate; + statistics::Histogram mshrEntryLength; }; CoalesceStats stats; From c8a4614a803d97b0c714637cc3196e8df646338a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 4 Sep 2022 20:42:43 -0700 Subject: [PATCH 153/247] Fixing asserion error on busyMask. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/busyMaskErr | 16 ++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 7 ++++++- 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 src/accl/graph/sega/busyMaskErr diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 5cf557719f..3fa5b99b3a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str): + def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=32, register_file_size=32) diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr new file mode 100644 index 0000000000..316fcd37d9 --- /dev/null +++ b/src/accl/graph/sega/busyMaskErr @@ -0,0 +1,16 @@ +gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0 + +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. 
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. +32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. + +// This assertion would be hit although it should not. +// It is fixed by a hack in recvWLRead when hit in the cache. 
+assert(cacheBlocks[block_index].busyMask == 0); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1715d637f1..3ff867c274 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -162,7 +162,12 @@ CoalesceEngine::recvWLRead(Addr addr) // and skip the process if the respective bit is set to false. cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); + // HACK: If a read happens on the same cycle as another operation such + // apply setLastChangedTick to half a cycle later so that operations + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); From 9ad5fa2f9175be1f2254bc2a0d7b92764b71d96f Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 5 Sep 2022 14:27:49 -0700 Subject: [PATCH 154/247] Fixing finding work in coalesce engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 90 ++++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 3 +- src/accl/graph/sega/mpu.cc | 4 +- 3 files changed, 54 insertions(+), 43 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 3ff867c274..7a52d29c98 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -50,7 +50,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), + _workCount(0), numPullsReceived(0), startSearchIndex(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -79,6 +79,9 @@ CoalesceEngine::registerMPU(MPU* mpu) bool CoalesceEngine::done() { + bool push_none = needsPush.none(); + DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", + __func__, push_none ? "true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -885,41 +888,46 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -std::tuple -CoalesceEngine::getOptimalBitVectorSlice() +std::tuple +CoalesceEngine::getOptimalPullAddr() { - bool hit_in_cache = false; - int slice_base = -1; - - // int score = 0; - // int max_score_possible = 3 * numElementsPerLine; - for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - // int current_score = 0; + int it = startSearchIndex; + int initial_search_index = startSearchIndex; + while (true) { uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; } - if (current_popcount == 0) { - continue; + if (current_popcount != 0) { + Addr addr = getBlockAddrFromBitIndex(it); + int block_index = getBlockIndex(addr); + // Only if it is in cache and it is in idle state. 
+ if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + return std::make_tuple(true, it, addr); + // Otherwise if it is in memory + } else if (cacheBlocks[block_index].addr != addr) { + if (pendingVertexPullReads.find(addr) != + pendingVertexPullReads.end()) { + startSearchIndex = + (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + return std::make_tuple(true, it, addr); + } + } } - // current_score += current_popcount; - Addr addr = getBlockAddrFromBitIndex(it); - int block_index = getBlockIndex(addr); - // Idle state: valid && !pendingApply && !pendingWB - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - return std::make_tuple(true, it); - } else if (cacheBlocks[block_index].addr != addr) { - return std::make_tuple(false, it); + it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + if (it == initial_search_index) { + break; } } - - return std::make_tuple(hit_in_cache, slice_base); + // return garbage + return std::make_tuple(false, -1, 0); } void @@ -927,10 +935,10 @@ CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; - std::tie(hit_in_cache, slice_base) = getOptimalBitVectorSlice(); + Addr addr; + std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); if (slice_base != -1) { - Addr addr = getBlockAddrFromBitIndex(slice_base); int block_index = getBlockIndex(addr); if (hit_in_cache) { 
assert(cacheBlocks[block_index].valid); @@ -958,10 +966,6 @@ CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) onTheFlyReqs++; pendingVertexPullReads.insert(addr); - // TODO: Set a tracking structure so that nextMemoryReadEvent knows - // It does not have to read this address anymore. It can simply set - // a flag to true (maybe not even needed just look if the cache has a - // line allocated for it in the cacheBlocks). } numPullsReceived--; } @@ -993,14 +997,18 @@ CoalesceEngine::recvMemRetry() void CoalesceEngine::recvVertexPull() { + bool should_schedule = (numPullsReceived == 0); numPullsReceived++; - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + + if (should_schedule) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 641ed327bb..92c28ae11e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -107,13 +107,14 @@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; + int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalBitVectorSlice(); + std::tuple getOptimalPullAddr(); std::unordered_set pendingVertexPullReads; diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 7b1727587a..63aa474542 100644 --- a/src/accl/graph/sega/mpu.cc +++ 
b/src/accl/graph/sega/mpu.cc @@ -194,7 +194,9 @@ MPU::sendPacket(PacketPtr pkt) void MPU::recvDoneSignal() { - centeralController->recvDoneSignal(); + if (done()) { + centeralController->recvDoneSignal(); + } } bool From d57d301f767ea1ed4268b6a6293d7c0c4ee040c5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 6 Sep 2022 14:21:37 -0700 Subject: [PATCH 155/247] Fixing choosing work in coalesce engine. --- src/accl/graph/sega/SConscript | 2 - src/accl/graph/sega/coalesce_engine.cc | 247 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 18 +- src/accl/graph/sega/push_engine.cc | 3 - 4 files changed, 194 insertions(+), 76 deletions(-) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 42a8d84ad5..5d48b46fba 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -43,13 +43,11 @@ Source('wl_engine.cc') DebugFlag('ApplyUpdates') DebugFlag('BaseMemoryEngine') -DebugFlag('BitVector') DebugFlag('CenteralController') DebugFlag('CacheBlockState') DebugFlag('CoalesceEngine') DebugFlag('PushEngine') DebugFlag('SEGAStructureSize') -DebugFlag('TempFlag') DebugFlag('WLEngine') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7a52d29c98..cf0e2872f6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -33,7 +33,6 @@ #include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/BitVector.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" @@ -80,7 +79,7 @@ bool CoalesceEngine::done() { bool push_none = needsPush.none(); - DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", + DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", __func__, push_none ? 
"true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); @@ -428,26 +427,23 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (cacheBlocks[block_index].valid))); // We have read the address to send the wl and it is not in the // cache. Simply send the items to the PushEngine. + + DPRINTF(CoalesceEngine, "%s: Received read response for pull read " + "for addr %lu.\n", __func__, addr); int it = getBitIndexBase(addr); - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was not found in the cache.\n", - __func__, addr); + uint64_t send_mask = pendingVertexPullReads[addr]; WorkListItem* items = pkt->getPtr(); // No applying of the line needed. - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { Addr vertex_addr = addr + i * sizeof(WorkListItem); - if (needsPush[it + i] == 1) { - _workCount--; + uint64_t vertex_send_mask = send_mask & (1 << i); + if (vertex_send_mask != 0) { + assert(needsPush[it + i] == 1); needsPush[it + i] = 0; + _workCount--; owner->recvVertexPush(vertex_addr, items[i]); - break; } } - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - pendingVertexPullReads.erase(addr); delete pkt; return true; @@ -720,6 +716,7 @@ CoalesceEngine::processNextApplyEvent() if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; + activeBits.push_back(bit_index_base + index); if (!owner->running()) { owner->start(); } @@ -888,19 +885,78 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -std::tuple +// std::tuple +// CoalesceEngine::getOptimalPullAddr() +// { +// int it = startSearchIndex; +// int initial_search_index = startSearchIndex; +// while (true) { +// uint32_t current_popcount = 0; +// for (int i = 0; i < numElementsPerLine; i++) { +// current_popcount += needsPush[it + i]; +// } +// if 
(current_popcount != 0) { +// Addr addr = getBlockAddrFromBitIndex(it); +// int block_index = getBlockIndex(addr); +// // Only if it is in cache and it is in idle state. +// if ((cacheBlocks[block_index].addr == addr) && +// (cacheBlocks[block_index].valid) && +// (cacheBlocks[block_index].busyMask == 0) && +// (!cacheBlocks[block_index].pendingApply) && +// (!cacheBlocks[block_index].pendingWB)) { +// assert(!cacheBlocks[block_index].needsApply); +// assert(!cacheBlocks[block_index].pendingData); +// startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// return std::make_tuple(true, it, addr); +// // Otherwise if it is in memory +// } else if (cacheBlocks[block_index].addr != addr) { +// if (pendingVertexPullReads.find(addr) != +// pendingVertexPullReads.end()) { +// startSearchIndex = +// (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// return std::make_tuple(true, it, addr); +// } +// } +// } +// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// if (it == initial_search_index) { +// break; +// } +// } +// // return garbage +// return std::make_tuple(false, -1, 0); +// } + +std::tuple CoalesceEngine::getOptimalPullAddr() { - int it = startSearchIndex; - int initial_search_index = startSearchIndex; - while (true) { - uint32_t current_popcount = 0; - for (int i = 0; i < numElementsPerLine; i++) { - current_popcount += needsPush[it + i]; - } - if (current_popcount != 0) { - Addr addr = getBlockAddrFromBitIndex(it); - int block_index = getBlockIndex(addr); + int visited_bits = 0; + int num_intial_active_bits = activeBits.size(); + while (visited_bits < num_intial_active_bits) { + int index = activeBits.front(); + int base_index = roundDown(index, numElementsPerLine); + int index_offset = index - base_index; + assert(needsPush[index] == 1); + assert(index_offset < numElementsPerLine); + + Addr addr = getBlockAddrFromBitIndex(base_index); + int block_index = getBlockIndex(addr); + if (pendingVertexPullReads.find(addr) != 
pendingVertexPullReads.end()) + { + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + activeBits.pop_front(); + return std::make_tuple( + BitStatus::PENDING_READ, addr, index_offset); + /* + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask = 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + */ + } else { // Only if it is in cache and it is in idle state. if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid) && @@ -909,67 +965,122 @@ CoalesceEngine::getOptimalPullAddr() (!cacheBlocks[block_index].pendingWB)) { assert(!cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); - startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - return std::make_tuple(true, it, addr); + activeBits.pop_front(); + return std::make_tuple( + BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory } else if (cacheBlocks[block_index].addr != addr) { - if (pendingVertexPullReads.find(addr) != - pendingVertexPullReads.end()) { - startSearchIndex = - (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - return std::make_tuple(true, it, addr); - } + activeBits.pop_front(); + return std::make_tuple( + BitStatus::IN_MEMORY, addr, index_offset); } } - it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - if (it == initial_search_index) { - break; - } + activeBits.pop_front(); + activeBits.push_back(index); + visited_bits++; } - // return garbage - return std::make_tuple(false, -1, 0); + + return std::make_tuple(BitStatus::GARBAGE, 0, 0); } +// void +// CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) +// { +// bool hit_in_cache; +// int slice_base; +// Addr addr; + +// std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); +// if (slice_base != 
-1) { +// int block_index = getBlockIndex(addr); +// if (hit_in_cache) { +// assert(cacheBlocks[block_index].valid); +// assert(cacheBlocks[block_index].busyMask == 0); + +// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// for (int i = 0; i < numElementsPerLine; i++) { +// Addr vertex_addr = addr + i * sizeof(WorkListItem); +// if (needsPush[slice_base + i] == 1) { +// _workCount--; +// needsPush[slice_base + i] = 0; +// owner->recvVertexPush(vertex_addr, +// cacheBlocks[block_index].items[i]); +// break; +// } +// } +// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// } else { +// PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); +// SenderState* sender_state = new SenderState(true); +// pkt->pushSenderState(sender_state); +// memPort.sendPacket(pkt); +// onTheFlyReqs++; +// pendingVertexPullReads.insert(addr); +// } +// numPullsReceived--; +// } + +// if (numPullsReceived > 0) { +// memoryFunctionQueue.emplace_back( +// [this] (int slice_base, Tick schedule_tick) { +// processNextVertexPull(slice_base, schedule_tick); +// }, 0, curTick()); +// DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " +// "0 to memoryFunctionQueue.\n", __func__); +// } +// } + void -CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - bool hit_in_cache; - int slice_base; - Addr addr; - - std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); - if (slice_base != -1) { - int block_index = getBlockIndex(addr); - if (hit_in_cache) { - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - if (needsPush[slice_base + i] == 1) { - _workCount--; - 
needsPush[slice_base + i] = 0; - owner->recvVertexPush(vertex_addr, - cacheBlocks[block_index].items[i]); - break; - } - } - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - } else { + BitStatus bit_status; + Addr location; + int offset; + + std::tie(bit_status, location, offset) = getOptimalPullAddr(); + + if (bit_status != BitStatus::GARBAGE) { + if (bit_status == BitStatus::PENDING_READ) { + // renaming the outputs to thier local names. + Addr addr = location; + int index_offset = offset; + + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + } + if (bit_status == BitStatus::IN_CACHE) { + // renaming the outputs to their local names. + int block_index = (int) location; + int wl_offset = offset; + + Addr addr = cacheBlocks[block_index].addr; + Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); + int slice_base_index = getBitIndexBase(addr); + + needsPush[slice_base_index + wl_offset] = 0; + _workCount--; + owner->recvVertexPush( + vertex_addr, cacheBlocks[block_index].items[wl_offset]); + } + if (bit_status == BitStatus::IN_MEMORY) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); onTheFlyReqs++; - - pendingVertexPullReads.insert(addr); + pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; } - if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 92c28ae11e..fe7c83afb2 100644 --- 
a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -42,6 +42,14 @@ namespace gem5 { +enum BitStatus +{ + PENDING_READ, + IN_CACHE, + IN_MEMORY, + GARBAGE +}; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -107,22 +115,26 @@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; + // CLEAN: Replace with slice_base_queue int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; + std::deque activeBits; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); + std::tuple getOptimalPullAddr(); - std::unordered_set pendingVertexPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextVertexPull(int slice_base, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); std::deque, int, Tick>> memoryFunctionQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0134133cfa..505d41b0b8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -31,7 +31,6 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" -#include "debug/TempFlag.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -129,8 +128,6 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, addr, (uint32_t) wl.prop); numPendingPulls--; - DPRINTF(TempFlag, "%s: Received {addr: %lu, wl: %s}.\n", - __func__, addr, wl.to_string()); if (workLeft() && 
vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } From 8d4f9b0e2bb82986db1d367e03cc6be48140d55c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 27 Jul 2022 18:36:52 -0700 Subject: [PATCH 156/247] Adding support for synthetic traffic --- configs/accl/sega.py | 125 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 3fa5b99b3a..8e901b6e6d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,8 +1,35 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import m5 +import os import argparse +import subprocess from math import log -import math from m5.objects import * def interleave_addresses(plain_range, num_channels, cache_line_size): @@ -103,21 +130,101 @@ def __init__(self, def get_inputs(): argparser = argparse.ArgumentParser() - argparser.add_argument("num_mpus", type=int) + argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph_path", type=str) + argparser.add_argument("vertex_cache_line_size", type=int) + argparser.add_argument("synthetic", type=bool) + argparser.add_argument("--scale", type=int) + argparser.add_argument("--deg", type=int) + argparser.add_argument("--graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + args = argparser.parse_args() - return args.num_mpus, args.cache_size, \ - args.graph_path, args.init_addr, args.init_value + if args.synthetic: + if (args.scale is None) or (args.deg is None): + raise ValueError("If synthetic is true, you should specify the" + "scale of the graph by --scale [scale] and the average" + "degree of the graph by --deg [average degree].") + else: + if args.graph is None: + raise ValueError("If synthetic is false, you should specify the " + "path to graph binaries by --graph [path to graph].") + return args if __name__ == "__m5_main__": - num_mpus, cache_size, graph_path, first_addr, first_value = get_inputs() - 
- print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, cache_size, graph_path, first_addr, first_value) + input_args = get_inputs() + + image_path = None + if input_args.synthetic: + base_dir = os.environ.get("GRAPH_DIR", default="/tmp") + graph_gen = os.path.abspath(os.environ.get("GRAPH_GEN")) + graph_reader = os.environ.get("GRAPH_READER") + graph_sorter = os.environ.get("GRAPH_SORTER") + if graph_gen is None: + raise ValueError(f"No value for $GRAPH_GEN.") + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER") + + graph_path = os.path.join(base_dir, f"graph_{input_args.scale}_{input_args.deg}") + if not os.path.exists(graph_path): + print(f"{graph_path} does not exist already.") + os.mkdir(graph_path) + print(f"Created {graph_path}") + + if not "graph.txt" in os.listdir(graph_path): + print(f"graph.txt not found in {graph_path}") + subprocess.run([f"{graph_gen}", + f"{input_args.scale}", + f"{input_args.deg}", + f"{graph_path}/graph_unordered.txt"]) + print(f"Generated a graph with scale " + f"{input_args.scale} and deg {input_args.deg}") + subprocess.run(["python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt"]) + print(f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt") + subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) + print(f"Deleted {graph_path}/graph_unordered.txt") + + if not "binaries" in os.listdir(graph_path): + print(f"binaries directory not found in {graph_path}") + os.mkdir(f"{graph_path}/binaries") + print(f"Created {graph_path}/binaries") + + if not f"gpts_{input_args.num_gpts}" in os.listdir(f"{graph_path}/binaries"): + print(f"gpts_{input_args.num_gpts} not found in {graph_path}/binaries") + os.mkdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") + print(f"Created 
{graph_path}/binaries/gpts_{input_args.num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(input_args.num_gpts)] + if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{input_args.num_gpts}") + for delete in os.scandir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}"): + os.remove(delete.path) + print(f"Deleted all the files in {graph_path}/binaries/gpts_{input_args.num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}/graph.txt", + "false", + f"{input_args.num_gpts}", + f"{input_args.vertex_cache_line_size}", + f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_path}/binaries/n{input_args.num_gpts}") + image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" + else: + image_path = input_args.graph + + system = SEGA(input_args.num_gpts, + input_args.cache_size, + image_path, + input_args.init_addr, + input_args.init_value) root = Root(full_system = False, system = system) m5.instantiate() From 7ddb4cf48879fca09694b983c46ae486bbf97bc2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 27 Jul 2022 23:42:01 -0700 Subject: [PATCH 157/247] Adding workload as a parameter --- configs/accl/sega.py | 2 +- src/accl/graph/sega/PushEngine.py | 2 ++ src/accl/graph/sega/WLEngine.py | 2 ++ src/accl/graph/sega/push_engine.cc | 17 ++++++++++++++++- src/accl/graph/sega/push_engine.hh | 3 ++- src/accl/graph/sega/wl_engine.cc | 19 +++++++++++++++++-- src/accl/graph/sega/wl_engine.hh | 5 +++++ 7 files changed, 45 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8e901b6e6d..ddeae34e4e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -215,7 +215,7 @@ def get_inputs(): f"{input_args.vertex_cache_line_size}", f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) print(f"Created the 
graph binaries in " - f"{graph_path}/binaries/n{input_args.num_gpts}") + f"{graph_path}/binaries/gpts_{input_args.num_gpts}") image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" else: image_path = input_args.graph diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index f98f22ba9d..ad9ddfefcf 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -41,3 +41,5 @@ class PushEngine(BaseMemoryEngine): resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " "edges read from memory") + + workload = Param.String("BFS", "Name of the workload") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 52ca031260..a44352ab9b 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -40,3 +40,5 @@ class WLEngine(BaseReduceEngine): "WLEngine has. It can service as " "many updates as this queueu has " "entries at the same time.") # 4 is arbitrary + + workload = Param.String('BFS',"Name of the workload") \ No newline at end of file diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 505d41b0b8..9f13c00397 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,6 +42,7 @@ PushEngine::PushEngine(const Params& params): _running(false), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + workload(params.workload), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), @@ -85,6 +86,20 @@ PushEngine::done() edgePointerQueue.empty(); } + +uint32_t +PushEngine::propagate(uint32_t value, uint32_t weight) +{ + uint32_t update; + if (workload == "BFS") { + update = value + 1; + } + else{ + panic("The workload %s is 
not supported", workload); + } + return update; +} + void PushEngine::start() { @@ -239,7 +254,7 @@ PushEngine::processNextPushEvent() __func__, curr_edge.to_string()); // TODO: Implement propagate function here - uint32_t update_value = curr_edge.value + 1; + uint32_t update_value = propagate(value, 1); PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 6f92b62be0..a64a5b1f5b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -82,7 +82,6 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } }; - struct PushInfo { Addr src; uint32_t value; @@ -103,6 +102,8 @@ class PushEngine : public BaseMemoryEngine int edgeQueueSize; std::deque> edgeQueue; + std::string workload; + uint32_t propagate(uint32_t value, uint32_t weight); template PacketPtr createUpdatePacket(Addr addr, T value); bool vertexSpace(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9890eeed76..855e36b413 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -41,6 +41,7 @@ WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), + workload(params.workload), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) @@ -58,6 +59,18 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } +uint32_t +WLEngine::reduce(uint32_t update, uint32_t value) +{ + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} + bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { @@ -138,7 +151,8 @@ WLEngine::processNextReadEvent() "addr: %lu in 
registerFile. registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = - std::min(update_value, registerFile[update_addr]); + reduce(update_value, registerFile[update_addr]); + // std::min(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -191,7 +205,8 @@ WLEngine::processNextReduceEvent() addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = - std::min(update_value, workListFile[addr].tempProp); + reduce(update_value, workListFile[addr].tempProp); + // std::min(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 4a0489b123..b03a3cdb87 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -47,6 +47,8 @@ class WLEngine : public BaseReduceEngine private: MPU* owner; + + int updateQueueSize; std::deque> updateQueue; @@ -55,6 +57,9 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; + std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); + EventFunctionWrapper nextReadEvent; void processNextReadEvent(); From 302bc6e3e6be79a515890427c50b765a463441b1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 7 Sep 2022 13:22:40 -0700 Subject: [PATCH 158/247] Adding workload as a parameter to coalesce engine. 
--- src/accl/graph/sega/CoalesceEngine.py | 5 ++ src/accl/graph/sega/coalesce_engine.cc | 120 ++++--------------------- src/accl/graph/sega/coalesce_engine.hh | 5 +- src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 2 - 5 files changed, 28 insertions(+), 106 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 2cc756ff3f..f6e997f1e3 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -35,7 +35,12 @@ class CoalesceEngine(BaseMemoryEngine): cxx_class = 'gem5::CoalesceEngine' cache_size = Param.MemorySize("Size of the internal SRAM array.") + num_mshr_entry = Param.Int("Number of MSHR entries.") + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") + + workload = Param.String("BFS", "Name of the workload") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index cf0e2872f6..a80d629737 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,7 +49,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), startSearchIndex(0), + _workCount(0), numPullsReceived(0), workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -85,6 +85,18 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } +uint32_t +CoalesceEngine::reduce(uint32_t update, uint32_t value) +{ + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} + // addr should be aligned to peerMemoryAtomSize int 
CoalesceEngine::getBlockIndex(Addr addr) @@ -700,8 +712,12 @@ CoalesceEngine::processNextApplyEvent() assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - uint32_t new_prop = std::min(current_prop, - cacheBlocks[block_index].items[index].tempProp); + // NOTE: It might be the case that for workloads other than BFS, + // the reduce function here should be different to the reduce + // function defined in WLEngine. Think about the case of PR in + // detail. + uint32_t new_prop = reduce( + cacheBlocks[block_index].items[index].tempProp, current_prop); if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; @@ -885,48 +901,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -// std::tuple -// CoalesceEngine::getOptimalPullAddr() -// { -// int it = startSearchIndex; -// int initial_search_index = startSearchIndex; -// while (true) { -// uint32_t current_popcount = 0; -// for (int i = 0; i < numElementsPerLine; i++) { -// current_popcount += needsPush[it + i]; -// } -// if (current_popcount != 0) { -// Addr addr = getBlockAddrFromBitIndex(it); -// int block_index = getBlockIndex(addr); -// // Only if it is in cache and it is in idle state. 
-// if ((cacheBlocks[block_index].addr == addr) && -// (cacheBlocks[block_index].valid) && -// (cacheBlocks[block_index].busyMask == 0) && -// (!cacheBlocks[block_index].pendingApply) && -// (!cacheBlocks[block_index].pendingWB)) { -// assert(!cacheBlocks[block_index].needsApply); -// assert(!cacheBlocks[block_index].pendingData); -// startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// return std::make_tuple(true, it, addr); -// // Otherwise if it is in memory -// } else if (cacheBlocks[block_index].addr != addr) { -// if (pendingVertexPullReads.find(addr) != -// pendingVertexPullReads.end()) { -// startSearchIndex = -// (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// return std::make_tuple(true, it, addr); -// } -// } -// } -// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// if (it == initial_search_index) { -// break; -// } -// } -// // return garbage -// return std::make_tuple(false, -1, 0); -// } - std::tuple CoalesceEngine::getOptimalPullAddr() { @@ -949,13 +923,6 @@ CoalesceEngine::getOptimalPullAddr() activeBits.pop_front(); return std::make_tuple( BitStatus::PENDING_READ, addr, index_offset); - /* - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask = 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - */ } else { // Only if it is in cache and it is in idle state. 
if ((cacheBlocks[block_index].addr == addr) && @@ -983,55 +950,6 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple(BitStatus::GARBAGE, 0, 0); } -// void -// CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) -// { -// bool hit_in_cache; -// int slice_base; -// Addr addr; - -// std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); -// if (slice_base != -1) { -// int block_index = getBlockIndex(addr); -// if (hit_in_cache) { -// assert(cacheBlocks[block_index].valid); -// assert(cacheBlocks[block_index].busyMask == 0); - -// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// for (int i = 0; i < numElementsPerLine; i++) { -// Addr vertex_addr = addr + i * sizeof(WorkListItem); -// if (needsPush[slice_base + i] == 1) { -// _workCount--; -// needsPush[slice_base + i] = 0; -// owner->recvVertexPush(vertex_addr, -// cacheBlocks[block_index].items[i]); -// break; -// } -// } -// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// } else { -// PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); -// SenderState* sender_state = new SenderState(true); -// pkt->pushSenderState(sender_state); -// memPort.sendPacket(pkt); -// onTheFlyReqs++; -// pendingVertexPullReads.insert(addr); -// } -// numPullsReceived--; -// } - -// if (numPullsReceived > 0) { -// memoryFunctionQueue.emplace_back( -// [this] (int slice_base, Tick schedule_tick) { -// processNextVertexPull(slice_base, schedule_tick); -// }, 0, curTick()); -// DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " -// "0 to memoryFunctionQueue.\n", __func__); -// } -// } - void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index fe7c83afb2..7503d69b76 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -115,8 +115,6 
@@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; - // CLEAN: Replace with slice_base_queue - int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; @@ -130,6 +128,9 @@ class CoalesceEngine : public BaseMemoryEngine // send for push when getting the read response from memory. std::unordered_map pendingVertexPullReads; + std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9f13c00397..625f836561 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -254,7 +254,7 @@ PushEngine::processNextPushEvent() __func__, curr_edge.to_string()); // TODO: Implement propagate function here - uint32_t update_value = propagate(value, 1); + uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 855e36b413..5465769cff 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -152,7 +152,6 @@ WLEngine::processNextReadEvent() __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = reduce(update_value, registerFile[update_addr]); - // std::min(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. 
registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -206,7 +205,6 @@ WLEngine::processNextReduceEvent() // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = reduce(update_value, workListFile[addr].tempProp); - // std::min(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; From ab2362a81cfec8311e017d824c9d6208beec235d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 8 Sep 2022 10:20:48 -0700 Subject: [PATCH 159/247] Adding stats. --- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 21 ++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 4 +++- src/accl/graph/sega/push_engine.cc | 7 ++++++- src/accl/graph/sega/push_engine.hh | 2 ++ src/accl/graph/sega/wl_engine.cc | 9 ++++++++- src/accl/graph/sega/wl_engine.hh | 1 + 7 files changed, 35 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ddeae34e4e..e8d76e7dad 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -159,7 +159,7 @@ def get_inputs(): image_path = None if input_args.synthetic: base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.path.abspath(os.environ.get("GRAPH_GEN")) + graph_gen = os.environ.get("GRAPH_GEN") graph_reader = os.environ.get("GRAPH_READER") graph_sorter = os.environ.get("GRAPH_SORTER") if graph_gen is None: diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a80d629737..dbe5e56f2d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -210,7 +210,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. 
Rejecting request.\n", __func__, block_index); - stats.readRejections++; + stats.mshrTargetShortage++; return false; } else { DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " @@ -241,7 +241,7 @@ CoalesceEngine::recvWLRead(Addr addr) "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection - stats.readRejections++; + stats.mshrEntryShortage++; return false; } else { DPRINTF(CoalesceEngine, "%s: MSHR " @@ -399,7 +399,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. Rejecting request.\n", __func__, block_index); - stats.readRejections++; + stats.mshrTargetShortage++; return false; } DPRINTF(CoalesceEngine, "%s: There is room for another target " @@ -740,6 +740,8 @@ CoalesceEngine::processNextApplyEvent() } } } + stats.bitvectorLength.sample(needsPush.count()); + cacheBlocks[block_index].needsWB = true; cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; @@ -1055,12 +1057,16 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), - ADD_STAT(readRejections, statistics::units::Count::get(), - "Number of cache rejections."), + ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), + "Number of cache rejections caused by entry shortage."), + ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), + "Number of cache rejections caused by target shortage."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries.") + "Histogram on the length of the mshr entries."), + ADD_STAT(bitvectorLength, statistics::units::Count::get(), + "Histogram of the length of the bitvector") { } @@ -1069,7 +1075,8 @@ 
CoalesceEngine::CoalesceStats::regStats() { using namespace statistics; - mshrEntryLength.init(64); + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); hitRate = (readHits + readHitUnderMisses) / (readHits + readHitUnderMisses + readMisses); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 7503d69b76..16c417fc60 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -158,10 +158,12 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; - statistics::Scalar readRejections; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; statistics::Formula hitRate; statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; }; CoalesceStats stats; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 625f836561..855d666989 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -300,7 +300,10 @@ PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), ADD_STAT(numUpdates, statistics::units::Count::get(), - "Number of sent updates.") + "Number of sent updates."), + ADD_STAT(TEPS, statistics::units::Rate::get(), + "Traversed Edges Per Second.") { } @@ -308,6 +311,8 @@ void PushEngine::PushStats::regStats() { using namespace statistics; + + TEPS = numUpdates / simSeconds; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a64a5b1f5b..a5677067b8 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -127,6 +127,8 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; statistics::Scalar numUpdates; + + statistics::Formula TEPS; }; PushStats stats; diff --git a/src/accl/graph/sega/wl_engine.cc 
b/src/accl/graph/sega/wl_engine.cc index 5465769cff..a39905037e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -144,6 +144,10 @@ WLEngine::processNextReadEvent() update_value, updateQueue.size(), updateQueueSize); owner->checkRetryReq(); } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + stats.registerShortage++; } } else { // TODO: Generalize this to reduce function rather than just min @@ -231,7 +235,10 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), - "Number of memory blocks read for vertecies") + "Number of memory blocks read for vertecies"), + ADD_STAT(registerShortage, statistics::units::Count::get(), + "Number of times updates were " + "stalled because of register shortage") { } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index b03a3cdb87..2956e58666 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -76,6 +76,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; + statistics::Scalar registerShortage; }; WorkListStats stats; From 40b01f05558c798a20e60b26822d9ca8241b47eb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 11 Sep 2022 14:39:42 -0700 Subject: [PATCH 160/247] Separating graph generation from run script. 
--- configs/accl/graph-gen.py | 103 ++++++++++++++++++++++++++++++++++++++ configs/accl/sega.py | 96 +++-------------------------------- 2 files changed, 110 insertions(+), 89 deletions(-) create mode 100644 configs/accl/graph-gen.py diff --git a/configs/accl/graph-gen.py b/configs/accl/graph-gen.py new file mode 100644 index 0000000000..16985b3537 --- /dev/null +++ b/configs/accl/graph-gen.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import argparse +import subprocess + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("scale", type=int, help="The scale of the synth graph to generate.") + argparser.add_argument("deg", type=int, help="The average degree of the synth graph to generate.") + argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + + args = argparser.parse_args() + return args.scale, args.deg, args.num_gpts + +if __name__ == "__main__": + scale, deg, num_gpts = get_inputs() + + base_dir = os.environ.get("GRAPH_DIR", default="/tmp") + graph_gen = os.environ.get("GRAPH_GEN") + graph_reader = os.environ.get("GRAPH_READER") + graph_sorter = os.environ.get("GRAPH_SORTER") + if graph_gen is None: + raise ValueError(f"No value for $GRAPH_GEN.") + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER") + + graph_path = os.path.join(base_dir, f"graph_{scale}_{deg}") + if not os.path.exists(graph_path): + print(f"{graph_path} does not exist already.") + os.mkdir(graph_path) + print(f"Created {graph_path}") + + if not "graph.txt" in os.listdir(graph_path): + print(f"graph.txt not found in {graph_path}") + for delete in os.scandir(graph_path): + os.remove(delete.path) + print(f"Deleted everything in {graph_path}") + subprocess.run([f"{graph_gen}", + f"{scale}", + f"{deg}", + f"{graph_path}/graph_unordered.txt"]) + print(f"Generated a graph with scale " + f"{scale} and deg {deg}") + subprocess.run(["python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt"]) + print(f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt") + subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) + print(f"Deleted {graph_path}/graph_unordered.txt") + + if not "binaries" in os.listdir(graph_path): + print(f"binaries directory not found in 
{graph_path}") + os.mkdir(f"{graph_path}/binaries") + print(f"Created {graph_path}/binaries") + + if not f"gpts_{num_gpts}" in os.listdir(f"{graph_path}/binaries"): + print(f"gpts_{num_gpts} not found in {graph_path}/binaries") + os.mkdir(f"{graph_path}/binaries/gpts_{num_gpts}") + print(f"Created {graph_path}/binaries/gpts_{num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] + if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): + os.remove(delete.path) + print(f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}/graph.txt", + "false", + f"{num_gpts}", + "32", + f"{graph_path}/binaries/gpts_{num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_path}/binaries/gpts_{num_gpts}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e8d76e7dad..10f7ea2b48 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -25,9 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import m5 -import os import argparse -import subprocess from math import log from m5.objects import * @@ -49,7 +47,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=32, + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, @@ -132,99 +130,19 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("vertex_cache_line_size", type=int) - argparser.add_argument("synthetic", type=bool) - argparser.add_argument("--scale", type=int) - argparser.add_argument("--deg", type=int) - argparser.add_argument("--graph", type=str) + argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - if args.synthetic: - if (args.scale is None) or (args.deg is None): - raise ValueError("If synthetic is true, you should specify the" - "scale of the graph by --scale [scale] and the average" - "degree of the graph by --deg [average degree].") - else: - if args.graph is None: - raise ValueError("If synthetic is false, you should specify the " - "path to graph binaries by --graph [path to graph].") - return args + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value if __name__ == "__m5_main__": - input_args = get_inputs() - - image_path = None - if input_args.synthetic: - base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.environ.get("GRAPH_GEN") - graph_reader = os.environ.get("GRAPH_READER") - graph_sorter = os.environ.get("GRAPH_SORTER") - if graph_gen is None: - raise ValueError(f"No value for $GRAPH_GEN.") - if graph_reader is None: - 
raise ValueError(f"No value for $GRAPH_READER.") - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER") - - graph_path = os.path.join(base_dir, f"graph_{input_args.scale}_{input_args.deg}") - if not os.path.exists(graph_path): - print(f"{graph_path} does not exist already.") - os.mkdir(graph_path) - print(f"Created {graph_path}") - - if not "graph.txt" in os.listdir(graph_path): - print(f"graph.txt not found in {graph_path}") - subprocess.run([f"{graph_gen}", - f"{input_args.scale}", - f"{input_args.deg}", - f"{graph_path}/graph_unordered.txt"]) - print(f"Generated a graph with scale " - f"{input_args.scale} and deg {input_args.deg}") - subprocess.run(["python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt"]) - print(f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt") - subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) - print(f"Deleted {graph_path}/graph_unordered.txt") - - if not "binaries" in os.listdir(graph_path): - print(f"binaries directory not found in {graph_path}") - os.mkdir(f"{graph_path}/binaries") - print(f"Created {graph_path}/binaries") - - if not f"gpts_{input_args.num_gpts}" in os.listdir(f"{graph_path}/binaries"): - print(f"gpts_{input_args.num_gpts} not found in {graph_path}/binaries") - os.mkdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") - print(f"Created {graph_path}/binaries/gpts_{input_args.num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(input_args.num_gpts)] - if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{input_args.num_gpts}") - for delete in os.scandir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}"): - os.remove(delete.path) - print(f"Deleted all the files in {graph_path}/binaries/gpts_{input_args.num_gpts}") - 
subprocess.run([f"{graph_reader}" , - f"{graph_path}/graph.txt", - "false", - f"{input_args.num_gpts}", - f"{input_args.vertex_cache_line_size}", - f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{input_args.num_gpts}") - image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" - else: - image_path = input_args.graph - - system = SEGA(input_args.num_gpts, - input_args.cache_size, - image_path, - input_args.init_addr, - input_args.init_value) + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) root = Root(full_system = False, system = system) m5.instantiate() From 6124b008976c8797d0b330815f9b04579abf42ce Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 12 Sep 2022 15:25:11 -0700 Subject: [PATCH 161/247] Adding new stats. --- src/accl/graph/sega/coalesce_engine.cc | 13 ++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 2 ++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dbe5e56f2d..7646ba8862 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -834,9 +834,13 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } } void @@ -1000,6 +1004,8 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; + } else { + stats.workSearchFails++; } if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( @@ -1061,6 +1067,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), + ADD_STAT(workSearchFails, statistics::units::Count::get(), + "Number of times coalesce engine fails to find work to push."), + ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), + "Number of times a memory block has been read twice. " + "Once for push and once to populate the cache."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 16c417fc60..355eaad07d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -160,6 +160,8 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; + statistics::Scalar workSearchFails; + statistics::Scalar numDoubleMemReads; statistics::Formula hitRate; statistics::Histogram mshrEntryLength; From 655902315cc2a07658100ebbdc568cb59523ef85 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 13 Sep 2022 21:44:54 -0700 Subject: [PATCH 162/247] Fixing sconscript style. 
--- src/accl/graph/base/SConscript | 6 ++--- src/accl/graph/sega/SConscript | 44 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 0e43d1aed8..8b741abfc8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -25,8 +25,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -Import('*') +Import("*") -SimObject('BaseReduceEngine.py') +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) -Source('base_reduce_engine.cc') +Source("base_reduce_engine.cc") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 5d48b46fba..f16d025ca2 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -25,30 +25,30 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-Import('*') +Import("*") -SimObject('BaseMemoryEngine.py') -SimObject('CenteralController.py') -SimObject('CoalesceEngine.py') -SimObject("MPU.py") -SimObject('PushEngine.py') -SimObject('WLEngine.py') +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) -Source('base_memory_engine.cc') -Source('centeral_controller.cc') -Source('coalesce_engine.cc') +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") Source("mpu.cc") -Source('push_engine.cc') -Source('wl_engine.cc') +Source("push_engine.cc") +Source("wl_engine.cc") -DebugFlag('ApplyUpdates') -DebugFlag('BaseMemoryEngine') -DebugFlag('CenteralController') -DebugFlag('CacheBlockState') -DebugFlag('CoalesceEngine') -DebugFlag('PushEngine') -DebugFlag('SEGAStructureSize') -DebugFlag('WLEngine') +DebugFlag("ApplyUpdates") +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("WLEngine") -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', - 'WLEngine', 'BaseMemoryEngine']) \ No newline at end of file +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine"]) \ No newline at end of file From 489e914deb132f3b81cd0b31ff0254226aa08db9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 15 Sep 2022 11:16:25 -0700 Subject: [PATCH 163/247] Adding stats for measuring push and pull rate. 
--- configs/accl/sega.py | 21 ++++++++----- src/accl/graph/sega/coalesce_engine.cc | 34 ++++++++++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 41 ++++++++++++++++---------- 3 files changed, 72 insertions(+), 24 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 10f7ea2b48..2a92ee1769 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,8 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False)) + range=AddrRange(edge_memory_size), + in_addr_map=False)) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -92,7 +92,8 @@ def __init__(self, cache_size, graph_path, first_addr, - first_value): + first_value + ): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -103,16 +104,20 @@ def __init__(self, self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, response_latency=1, - width=64) + width=64 + ) self.ctrl = CenteralController(addr=first_addr, value=first_value, - image_file=f"{graph_path}/vertices") + image_file=f"{graph_path}/vertices" + ) + self.ctrl.req_port = self.interconnect.cpu_side_ports vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7646ba8862..5f1e849660 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -454,6 +454,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) needsPush[it + i] = 0; _workCount--; owner->recvVertexPush(vertex_addr, items[i]); + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - 
stats.lastResetTick; } } pendingVertexPullReads.erase(addr); @@ -990,6 +992,8 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) _workCount--; owner->recvVertexPush( vertex_addr, cacheBlocks[block_index].items[wl_offset]); + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } if (bit_status == BitStatus::IN_MEMORY) { Addr addr = location; @@ -1037,6 +1041,8 @@ CoalesceEngine::recvVertexPull() bool should_schedule = (numPullsReceived == 0); numPullsReceived++; + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; if (should_schedule) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { @@ -1052,7 +1058,7 @@ CoalesceEngine::recvVertexPull() CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), - + lastResetTick(0), ADD_STAT(numVertexReads, statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), @@ -1072,8 +1078,22 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), "Number of times a memory block has been read twice. " "Once for push and once to populate the cache."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. 
(Relative to reset_stats)"), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), @@ -1091,6 +1111,18 @@ CoalesceEngine::CoalesceStats::regStats() hitRate = (readHits + readHitUnderMisses) / (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 355eaad07d..8190478a1b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -147,25 +147,36 @@ class CoalesceEngine : public BaseMemoryEngine struct CoalesceStats : public statistics::Group { - CoalesceStats(CoalesceEngine &coalesce); + CoalesceStats(CoalesceEngine &coalesce); - void regStats() override; + virtual void regStats() override; - CoalesceEngine &coalesce; + virtual void resetStats() override; - statistics::Scalar numVertexReads; - statistics::Scalar numVertexWrites; - statistics::Scalar readHits; - statistics::Scalar readMisses; - statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; - statistics::Scalar workSearchFails; - statistics::Scalar numDoubleMemReads; + CoalesceEngine &coalesce; - statistics::Formula hitRate; - statistics::Histogram mshrEntryLength; - statistics::Histogram bitvectorLength; + Tick lastResetTick; + + statistics::Scalar 
numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; + statistics::Scalar workSearchFails; + statistics::Scalar numDoubleMemReads; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; }; CoalesceStats stats; From b297c794e5c08daa6be9727b554687507594a034 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 16 Sep 2022 14:18:57 -0700 Subject: [PATCH 164/247] Added FinalAnswer debugFlag and answer printing. --- configs/accl/sega.py | 8 ++-- src/accl/graph/sega/CenteralController.py | 4 +- src/accl/graph/sega/SConscript | 2 +- src/accl/graph/sega/base_memory_engine.hh | 2 +- src/accl/graph/sega/centeral_controller.cc | 43 ++++++++++++++++++---- src/accl/graph/sega/centeral_controller.hh | 7 ++-- src/accl/graph/sega/coalesce_engine.cc | 36 ++++++++++++++---- src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/push_engine.hh | 2 + 9 files changed, 82 insertions(+), 24 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2a92ee1769..7b37742cdb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -107,9 +107,11 @@ def __init__(self, width=64 ) - self.ctrl = CenteralController(addr=first_addr, value=first_value, - image_file=f"{graph_path}/vertices" - ) + self.ctrl = CenteralController( + init_addr=first_addr, + init_value=first_value, + image_file=f"{graph_path}/vertices" + ) self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 6f6b12ea2c..9bee76511d 
100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -39,6 +39,6 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") - addr = Param.Addr("The addr for the initial update") - value = Param.Int("The value for the initial update") + init_addr = Param.Addr("The addr for the initial update") + init_value = Param.Int("The value for the initial update") image_file = Param.String("Path to the global memory image.") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index f16d025ca2..5d411be9ac 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -41,11 +41,11 @@ Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") -DebugFlag("ApplyUpdates") DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") DebugFlag("CacheBlockState") DebugFlag("CoalesceEngine") +DebugFlag("FinalAnswer") DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") DebugFlag("WLEngine") diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index f336edcbf1..afe7fd0433 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -108,7 +108,7 @@ class BaseMemoryEngine : public ClockedObject AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } - void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + virtual void recvFunctional(PacketPtr pkt) = 0; virtual void init() override; }; diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 5ce7228abb..c6de1d8390 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,8 +28,6 @@ #include "accl/graph/sega/centeral_controller.hh" -#include - #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" @@ -44,8 +42,7 @@ 
CenteralController::CenteralController ClockedObject(params), system(params.system), reqPort(name() + ".req_port", this), - addr(params.addr), - value(params.value) + maxVertexAddr(0) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -66,9 +63,9 @@ CenteralController::getPort(const std::string &if_name, PortID idx) void CenteralController::initState() { - ClockedObject::initState(); + // ClockedObject::initState(); - const auto &file = params().image_file; + const auto& file = params().image_file; if (file == "") return; @@ -77,6 +74,7 @@ CenteralController::initState() loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); + maxVertexAddr = image.maxAddr(); PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, system->cacheLineSize()); @@ -86,7 +84,10 @@ CenteralController::initState() void CenteralController::startup() { - PacketPtr first_update = createUpdatePacket(addr, value); + Addr initial_addr = params().init_addr; + uint32_t initial_value = params().init_value; + PacketPtr first_update = + createUpdatePacket(initial_addr, initial_value); if (!reqPort.blocked()) { reqPort.sendPacket(first_update); @@ -111,6 +112,21 @@ CenteralController::createUpdatePacket(Addr addr, T value) return pkt; } +PacketPtr +CenteralController::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC((Addr) 0); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + void CenteralController::ReqPort::sendPacket(PacketPtr pkt) { @@ -160,6 +176,19 @@ CenteralController::recvDoneSignal() } if (done) { + for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) { + PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + reqPort.sendFunctional(pkt); + + int 
num_items = system->cacheLineSize() / sizeof(WorkListItem); + WorkListItem items[num_items]; + pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + + for (int i = 0; i < num_items; i++) { + DPRINTF(FinalAnswer, "%s: WorkListItem[%lu][%d]: %s.\n", + __func__, addr, i, items[i].to_string()); + } + } exitSimLoopNow("no update left to process."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index c54c4c04ef..bd272cf30d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/mpu.hh" +#include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -67,12 +68,12 @@ class CenteralController : public ClockedObject System* system; ReqPort reqPort; - Addr addr; - uint32_t value; - + Addr maxVertexAddr; std::vector mpuVector; + template PacketPtr createUpdatePacket(Addr addr, T value); + PacketPtr createReadPacket(Addr addr, unsigned int size); void functionalAccess(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 5f1e849660..59d9720148 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,7 +32,6 @@ #include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" -#include "debug/ApplyUpdates.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" @@ -75,12 +74,38 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + 
assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + // NOTE: No need to check needsWB because there might be entries + // that have been updated and not written back in the cache. + // assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + memPort.sendFunctional(pkt); + } +} + bool CoalesceEngine::done() { - bool push_none = needsPush.none(); - DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", - __func__, push_none ? "true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -723,9 +748,6 @@ CoalesceEngine::processNextApplyEvent() if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu][%d]: %s.\n", - __func__, cacheBlocks[block_index].addr, index, - cacheBlocks[block_index].items[index].to_string()); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8190478a1b..bb6fd9d1ea 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -190,6 +190,8 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + virtual void recvFunctional(PacketPtr pkt); + bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a5677067b8..b317992b2d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -142,6 +142,8 @@ class 
PushEngine : public BaseMemoryEngine PushEngine(const Params& params); void registerMPU(MPU* mpu); + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); From 16216bc2bf3dee723fa35eccd478412e47bfe738 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 18 Sep 2022 17:17:24 -0700 Subject: [PATCH 165/247] Adding stats to measure vertexReadLatency. --- src/accl/graph/sega/coalesce_engine.cc | 5 ++++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 14 ++++++++++++-- src/accl/graph/sega/wl_engine.hh | 5 +++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 59d9720148..d4102a8bca 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -810,6 +810,7 @@ void CoalesceEngine::processNextMemoryEvent() { if (memPort.blocked()) { + stats.numMemoryBlocks++; nextMemoryEvent.sleep(); return; } @@ -1097,6 +1098,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by target shortage."), ADD_STAT(workSearchFails, statistics::units::Count::get(), "Number of times coalesce engine fails to find work to push."), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), "Number of times a memory block has been read twice. 
" "Once for push and once to populate the cache."), @@ -1147,4 +1150,4 @@ CoalesceEngine::CoalesceStats::resetStats() lastResetTick = curTick(); } -} +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index bb6fd9d1ea..967d83a531 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -165,6 +165,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; statistics::Scalar workSearchFails; + statistics::Scalar numMemoryBlocks; statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a39905037e..b16d827dbe 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -143,6 +143,7 @@ WLEngine::processNextReadEvent() "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); owner->checkRetryReq(); + vertexReadTime[update_addr] = curTick(); } } else { DPRINTF(WLEngine, "%s: There are no free registers " @@ -189,6 +190,11 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " "workListFile. 
workListFile.size = %d.\n", __func__, addr, wl.to_string(), workListFile.size()); + + stats.vertexReadLatency.sample( + (curTick() - vertexReadTime[addr]) / getClockFrequency()); + vertexReadTime.erase(addr); + assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); @@ -238,7 +244,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "Number of memory blocks read for vertecies"), ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " - "stalled because of register shortage") + "stalled because of register shortage"), + ADD_STAT(vertexReadLatency, statistics::units::Second::get(), + "Histogram of the latency of reading a vertex.") { } @@ -246,6 +254,8 @@ void WLEngine::WorkListStats::regStats() { using namespace statistics; -} + vertexReadLatency.init(64); } + +} // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2956e58666..0c6361825e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -47,13 +47,12 @@ class WLEngine : public BaseReduceEngine private: MPU* owner; - - int updateQueueSize; std::deque> updateQueue; int registerFileSize; std::unordered_map registerFile; + std::unordered_map vertexReadTime; std::unordered_map workListFile; @@ -77,6 +76,8 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + + statistics::Histogram vertexReadLatency; }; WorkListStats stats; From 3e6216c8976155517cb9edb2874ca7c890b56255 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 19 Sep 2022 11:56:05 -0700 Subject: [PATCH 166/247] Adding a config script with simple memory --- configs/accl/sega-simple.py | 177 ++++++++++++++++++++++++++++++++++++ configs/accl/sega.py | 48 ++++++---- 2 files changed, 206 insertions(+), 19 deletions(-) create mode 100644 configs/accl/sega-simple.py diff --git 
a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py new file mode 100644 index 0000000000..ae537e76ca --- /dev/null +++ b/configs/accl/sega-simple.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="75ns", + latency_var="0ns", + bandwidth="19.2GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="75ns", + latency_var="0ns", + bandwidth="19.2GB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__( + self, + num_mpus, + cache_size, + graph_path, + first_addr, + first_value + ): + super(SEGA, self).__init__() 
+ self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '1GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.interconnect = NoncoherentXBar( + frontend_latency=1, + forward_latency=1, + response_latency=1, + width=64 + ) + + self.ctrl = CenteralController( + init_addr=first_addr, + init_value=first_value, + image_file=f"{graph_path}/vertices" + ) + + self.ctrl.req_port = self.interconnect.cpu_side_ports + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("8GiB", cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setReqPort(self.interconnect.cpu_side_ports) + gpt.setRespPort(self.interconnect.mem_side_ports) + gpts.append(gpt) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + root = Root(full_system = False, system = system) + + m5.instantiate() + + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7b37742cdb..8c30d10dec 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,29 +47,39 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): 
def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, - register_file_size=32) - self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, num_tgts_per_mshr=32, - max_resp_per_cycle=4) - self.push_engine = PushEngine(push_req_queue_size=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64) + resp_queue_size=64 + ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False)) + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port - self.mpu = MPU(wl_engine=self.wl_engine, + self.mpu = MPU( + wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine) + push_engine=self.push_engine + ) def getRespPort(self): return self.mpu.in_port @@ -87,7 +97,8 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): - def __init__(self, + def __init__( + self, num_mpus, cache_size, graph_path, @@ -101,25 +112,24 @@ def __init__(self, self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar(frontend_latency=1, + self.interconnect = NoncoherentXBar( + frontend_latency=1, forward_latency=1, response_latency=1, width=64 ) self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, + addr=first_addr, value=first_value, image_file=f"{graph_path}/vertices" ) - self.ctrl.req_port = self.interconnect.cpu_side_ports vertex_ranges = interleave_addresses( - 
AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): From e1d8a934fdbb80520c46e18a136a271ac676d255 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 19 Sep 2022 20:27:40 -0700 Subject: [PATCH 167/247] Adding stats to count the result of bitvector search. --- src/accl/graph/sega/coalesce_engine.cc | 12 +++++++----- src/accl/graph/sega/coalesce_engine.hh | 3 ++- src/accl/graph/sega/push_engine.cc | 3 +++ src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d4102a8bca..b870345d57 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1031,9 +1031,10 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; - } else { - stats.workSearchFails++; } + + stats.bitvectorSearchStatus[bit_status]++; + if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { @@ -1096,8 +1097,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), - ADD_STAT(workSearchFails, statistics::units::Count::get(), - "Number of times coalesce engine fails to find work to push."), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), @@ -1111,6 +1110,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. 
(Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), + "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rateblocked()) { + stats.numNetBlocks++; nextPushEvent.sleep(); return; } @@ -301,6 +302,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) push(_push), ADD_STAT(numUpdates, statistics::units::Count::get(), "Number of sent updates."), + ADD_STAT(numNetBlocks, statistics::units::Count::get(), + "Number of updates blocked by network."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second.") diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index b317992b2d..801d8e567d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -127,6 +127,7 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; statistics::Scalar numUpdates; + statistics::Scalar numNetBlocks; statistics::Formula TEPS; }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b16d827dbe..c6e8fda523 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -192,7 +192,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) wl.to_string(), workListFile.size()); stats.vertexReadLatency.sample( - (curTick() - vertexReadTime[addr]) / getClockFrequency()); + ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); vertexReadTime.erase(addr); assert(!workListFile.empty()); From a0a0fbeaa85a09ee2545adfaedfc251de483b6fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 22 Sep 2022 12:21:46 -0700 Subject: [PATCH 168/247] Adding a stat to count number of idle cycles. 
--- src/accl/graph/sega/push_engine.cc | 6 +++++- src/accl/graph/sega/push_engine.hh | 6 +++--- src/accl/graph/sega/wl_engine.hh | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a56283cbf6..5029013acd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,7 +28,6 @@ #include "accl/graph/sega/push_engine.hh" -#include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" @@ -40,6 +39,7 @@ namespace gem5 PushEngine::PushEngine(const Params& params): BaseMemoryEngine(params), _running(false), + lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), workload(params.workload), @@ -107,6 +107,7 @@ PushEngine::start() assert(!nextVertexPullEvent.scheduled()); _running = true; + stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. 
assert(workLeft()); if (vertexSpace()) { @@ -123,6 +124,7 @@ PushEngine::processNextVertexPullEvent() if (!workLeft()) { _running = false; + lastIdleEntranceTick = curTick(); } if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { @@ -304,6 +306,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of sent updates."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), + ADD_STAT(numIdleCycles, statistics::units::Count::get(), + "Number of cycles PushEngine has been idle."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second.") diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 801d8e567d..1f139d061e 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -37,7 +37,6 @@ namespace gem5 { -class CoalesceEngine; class MPU; class PushEngine : public BaseMemoryEngine @@ -88,10 +87,10 @@ class PushEngine : public BaseMemoryEngine Addr offset; int numElements; }; + MPU* owner; bool _running; - int numElementsPerLine; - MPU* owner; + Tick lastIdleEntranceTick; int numPendingPulls; int edgePointerQueueSize; @@ -128,6 +127,7 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numUpdates; statistics::Scalar numNetBlocks; + statistics::Scalar numIdleCycles; statistics::Formula TEPS; }; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 0c6361825e..3d527df3cf 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -77,6 +77,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + statistics::Histogram vertexReadLatency; }; From efcc6d2b35a03dcaa078f5c95d91ef6028c7805b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 22 Sep 2022 17:32:46 -0700 Subject: [PATCH 169/247] Adding stats to measure queueing latencies. 
--- configs/accl/sega.py | 3 ++- src/accl/graph/base/data_structs.hh | 6 ++++-- src/accl/graph/sega/coalesce_engine.cc | 17 +++++++++++++++-- src/accl/graph/sega/coalesce_engine.hh | 5 ++++- src/accl/graph/sega/push_engine.cc | 25 +++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 12 +++++++++--- src/accl/graph/sega/wl_engine.cc | 12 +++++++++--- src/accl/graph/sega/wl_engine.hh | 4 ++-- 8 files changed, 64 insertions(+), 20 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8c30d10dec..a67551a5fd 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -120,7 +120,8 @@ def __init__( ) self.ctrl = CenteralController( - addr=first_addr, value=first_value, + init_addr=first_addr, + init_value=first_value, image_file=f"{graph_path}/vertices" ) self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 026a3cb7b2..a46aaf2de9 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -96,8 +96,10 @@ struct CompleteEdge { uint32_t weight; uint32_t value; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): - src(src), dst(dst), weight(weight), value(value) + uint64_t entrance; + + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): + src(src), dst(dst), weight(weight), value(value), entrance(entrance) {} std::string to_string() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b870345d57..62cae01613 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -826,6 +826,8 @@ CoalesceEngine::processNextMemoryEvent() next_memory_function_tick) = memoryFunctionQueue.front(); next_memory_function(next_memory_function_input, next_memory_function_tick); memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) 
+ * 1e9 / getClockFrequency()); DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " "memoryFunctionQueue.size = %d.\n", __func__, memoryFunctionQueue.size()); @@ -929,6 +931,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); + stats.numInvalidMemFunctions++; } } @@ -1110,6 +1113,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(numInvalidMemFunctions, statistics::units::Count::get(), + "Number of times a scheduled memory function has been invalid."), ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), @@ -1123,7 +1128,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(mshrEntryLength, statistics::units::Count::get(), "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), - "Histogram of the length of the bitvector.") + "Histogram of the length of the bitvector."), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") { } @@ -1134,7 +1141,11 @@ CoalesceEngine::CoalesceStats::regStats() mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); bitvectorLength.init(64); - bitvectorSearchStatus.init(4); + bitvectorSearchStatus.init(NUM_STATUS); + bitvectorSearchStatus.subname(0, "PENDING_READ"); + bitvectorSearchStatus.subname(1, "IN_CACHE"); + bitvectorSearchStatus.subname(2, "IN_MEMORY"); + bitvectorSearchStatus.subname(3, "GARBAGE"); hitRate = (readHits + readHitUnderMisses) / (readHits + 
readHitUnderMisses + readMisses); @@ -1142,6 +1153,8 @@ CoalesceEngine::CoalesceStats::regStats() vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + memoryFunctionLatency.init(64); } void diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2b7b17d196..262f75fbcf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,7 +47,8 @@ enum BitStatus PENDING_READ, IN_CACHE, IN_MEMORY, - GARBAGE + GARBAGE, + NUM_STATUS }; class MPU; @@ -170,6 +171,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; + statistics::Scalar numInvalidMemFunctions; statistics::Vector bitvectorSearchStatus; @@ -179,6 +181,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; + statistics::Histogram memoryFunctionLatency; }; CoalesceStats stats; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5029013acd..af1c904eda 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -142,8 +142,10 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, (uint32_t) wl.prop); + edgePointerQueue.emplace_back( + start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, + (uint32_t) wl.prop, curTick()); numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -182,6 +184,9 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current 
EdgeReadInfoGen is done.\n", __func__); + stats.edgePointerQueueLatency.sample( + (curTick() - curr_info.entrance()) * + 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); @@ -224,8 +229,8 @@ PushEngine::handleMemResp(PacketPtr pkt) Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back(push_info.src, edge_dst, - edge_weight, push_info.value); + edges.emplace_back( + push_info.src, edge_dst, edge_weight, push_info.value, curTick()); } edgeQueue.push_back(edges); onTheFlyMemReqs--; @@ -267,7 +272,8 @@ PushEngine::processNextPushEvent() "with value: %d.\n", __func__, curr_edge.src, curr_edge.dst, update_value); - + stats.edgeQueueLatency.sample( + (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); edge_list.pop_front(); if (edge_list.empty()) { edgeQueue.pop_front(); @@ -310,7 +316,11 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of cycles PushEngine has been idle."), ADD_STAT(TEPS, statistics::units::Rate::get(), - "Traversed Edges Per Second.") + "Traversed Edges Per Second."), + ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgeQueue.") { } @@ -320,6 +330,9 @@ PushEngine::PushStats::regStats() using namespace statistics; TEPS = numUpdates / simSeconds; + + edgePointerQueueLatency.init(64); + edgeQueueLatency.init(64); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1f139d061e..5d2277eb5a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -52,11 +52,12 @@ class PushEngine : public BaseMemoryEngine Addr _src; uint32_t _value; + Tick 
_entrance; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _src(src), _value(value) + size_t atom, Addr src, uint32_t value, Tick entrance): + _start(start), _end(end), _step(step), _atom(atom), + _src(src), _value(value), _entrance(entrance) {} std::tuple nextReadPacketInfo() @@ -80,6 +81,8 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } + + Tick entrance() { return _entrance; } }; struct PushInfo { Addr src; @@ -130,6 +133,9 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numIdleCycles; statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgeQueueLatency; }; PushStats stats; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index c6e8fda523..5d4dd1723e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -79,7 +79,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", __func__, pkt->getAddr(), pkt->getLE(), @@ -105,7 +105,8 @@ WLEngine::processNextReadEvent() { Addr update_addr; uint32_t update_value; - std::tie(update_addr, update_value) = updateQueue.front(); + Tick enter_tick; + std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
" "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); @@ -134,6 +135,7 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); + stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -162,6 +164,7 @@ WLEngine::processNextReadEvent() update_value, update_addr, registerFile[update_addr]); stats.registerFileCoalesce++; updateQueue.pop_front(); + stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -246,7 +249,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "Number of times updates were " "stalled because of register shortage"), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), - "Histogram of the latency of reading a vertex.") + "Histogram of the latency of reading a vertex (ns)."), + ADD_STAT(updateQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of dequeuing an update (ns).") { } @@ -256,6 +261,7 @@ WLEngine::WorkListStats::regStats() using namespace statistics; vertexReadLatency.init(64); + updateQueueLatency.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 3d527df3cf..f888979be9 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -48,7 +48,7 @@ class WLEngine : public BaseReduceEngine MPU* owner; int updateQueueSize; - std::deque> updateQueue; + std::deque> updateQueue; int registerFileSize; std::unordered_map registerFile; @@ -77,8 +77,8 @@ class WLEngine : public BaseReduceEngine statistics::Scalar 
registerFileCoalesce; statistics::Scalar registerShortage; - statistics::Histogram vertexReadLatency; + statistics::Histogram updateQueueLatency; }; WorkListStats stats; From baa1dcb8df2e4d09a05ed6b97fd1b36c24f92e74 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 10:52:35 -0700 Subject: [PATCH 170/247] Added pybindmethod to createInitialUpdate. merge added. --- configs/accl/sega-simple.py | 8 +- configs/accl/sega-single-simple.py | 151 ++++++++++++++++++++ configs/accl/sega-single.py | 155 +++++++++++++++++++++ src/accl/graph/sega/CenteralController.py | 8 +- src/accl/graph/sega/MPU.py | 1 + src/accl/graph/sega/base_memory_engine.cc | 20 +-- src/accl/graph/sega/centeral_controller.cc | 131 +++++------------ src/accl/graph/sega/centeral_controller.hh | 39 ++---- src/accl/graph/sega/coalesce_engine.cc | 27 ++++ src/base/addr_range.hh | 31 +++++ 10 files changed, 430 insertions(+), 141 deletions(-) create mode 100644 configs/accl/sega-single-simple.py create mode 100644 configs/accl/sega-single.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index ae537e76ca..e0a4fcc89e 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -65,15 +65,15 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) self.vertex_mem_ctrl = SimpleMemory( - latency="75ns", + latency="0ns", latency_var="0ns", - bandwidth="19.2GB/s" + bandwidth="0GB/s" ) self.edge_mem_ctrl = SimpleMemory( - latency="75ns", + latency="30ns", latency_var="0ns", - bandwidth="19.2GB/s", + bandwidth="32GB/s", range=AddrRange(edge_memory_size), in_addr_map=False ) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py new file mode 100644 index 0000000000..a87e6c53bb --- /dev/null +++ b/configs/accl/sega-single-simple.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="0GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + def set_vertex_image(self, vertex_image): + self.vertex_mem_ctrl.image_file = vertex_image + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__(self, cache_size, 
graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + gpts = [GPT("8GiB", cache_size)] + gpts[0].set_vertex_range(AddrRange("4GiB")) + gpts[0].set_edge_image(f"{graph_path}/edgelist_0") + gpts[0].setReqPort(gpts[0].getRespPort()) + self.gpts = gpts + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.cache_size, args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py new file mode 100644 index 0000000000..d9fe11a781 --- /dev/null +++ b/configs/accl/sega-single.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GiB/s" + ) + + self.edge_mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.dram.image_file = edge_image + +class SEGA(System): + def __init__(self, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + 
self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + plain_vertex_range = AddrRange("4GiB") + self._vertex_ranges = interleave_addresses( + plain_vertex_range, + 1, + 32 + ) + + gpts = [GPT("8GiB", cache_size)] + gpts[0].set_vertex_range(self._vertex_ranges[0]) + gpts[0].set_edge_image(f"{graph_path}/edgelist_0") + gpts[0].setReqPort(gpts[0].getRespPort()) + self.gpts = gpts + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.cache_size, args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 9bee76511d..0721ff977c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -27,6 +27,7 @@ from m5.params import * from m5.proxy import * +from m5.util.pybind import PyBindMethod from m5.objects.ClockedObject import ClockedObject class CenteralController(ClockedObject): @@ -35,10 +36,9 @@ class CenteralController(ClockedObject): cxx_class = 'gem5::CenteralController' system = Param.System(Parent.any, "System
this Engine is a part of") - req_port = RequestPort("Port to send updates to the outside") + + image_file = Param.String("Path to the vertex image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") - init_addr = Param.Addr("The addr for the initial update") - init_value = Param.Int("The value for the initial update") - image_file = Param.String("Path to the global memory image.") + cxx_exports = [PyBindMethod("createInitialBFSUpdate")] diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 2d65be2949..d80142b21e 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -45,3 +45,4 @@ class MPU(SimObject): "each instance of MPU object.") push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") + diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index 9bd1941b23..d9864664b1 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -59,14 +59,18 @@ void BaseMemoryEngine::init() { AddrRangeList memory_ranges = memPort.getAddrRanges(); - // BaseMemoryEngine only supports one memory. - assert(memory_ranges.size() == 1); - - peerMemoryRange = memory_ranges.front(); - DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is %s. " - "The range is %s interleaved.\n", __func__, - peerMemoryRange.to_string(), - peerMemoryRange.interleaved() ? "" : "not"); + + if (memory_ranges.size() == 2) { + peerMemoryRange = merge(memory_ranges.front(), memory_ranges.back()); + } else if (memory_ranges.size() == 1) { + peerMemoryRange = memory_ranges.front(); + } else { + panic("Received an unacceptable number of ranges from memory."); + } + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " + "%s. The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); } void diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c6de1d8390..68b88e9e77 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -37,12 +37,9 @@ namespace gem5 { -CenteralController::CenteralController - (const CenteralControllerParams ¶ms): +CenteralController::CenteralController(const Params& params): ClockedObject(params), - system(params.system), - reqPort(name() + ".req_port", this), - maxVertexAddr(0) + system(params.system) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -50,33 +47,35 @@ CenteralController::CenteralController } } -Port& -CenteralController::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - void CenteralController::initState() { - // ClockedObject::initState(); - + for (auto mpu: mpuVector) { + addrRangeListMap[mpu] = mpu->getAddrRanges(); + } const auto& file = params().image_file; if (file == "") return; - auto *object = loader::createObjectFile(file, true); + auto* object = loader::createObjectFile(file, true); fatal_if(!object, "%s: Could not load %s.", name(), file); loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); - maxVertexAddr = image.maxAddr(); - PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, - system->cacheLineSize()); + Addr maxVertexAddr = image.maxAddr(); + + PortProxy proxy( + [this](PacketPtr pkt) { + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + for (auto range: range_list) { + if (range.contains(pkt->getAddr())) { + mpu->recvFunctional(pkt); + break; + } + } + } + }, system->cacheLineSize()); panic_if(!image.write(proxy), "%s: Unable to write image."); } @@ -84,21 +83,24 @@ CenteralController::initState() void CenteralController::startup() { - Addr 
initial_addr = params().init_addr; - uint32_t initial_value = params().init_value; - PacketPtr first_update = - createUpdatePacket(initial_addr, initial_value); - - if (!reqPort.blocked()) { - reqPort.sendPacket(first_update); + while(!initialUpdates.empty()) { + PacketPtr front = initialUpdates.front(); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + for (auto range: range_list) { + if (range.contains(front->getAddr())) { + mpu->handleIncomingUpdate(front); + } + } + } + initialUpdates.pop_front(); } } template PacketPtr CenteralController::createUpdatePacket(Addr addr, T value) { - RequestPtr req = std::make_shared( - addr, sizeof(T), addr, value); + RequestPtr req = std::make_shared(addr, sizeof(T), addr, value); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher // bits req->setPC(((Addr) value) << 2); @@ -106,65 +108,17 @@ CenteralController::createUpdatePacket(Addr addr, T value) PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} -PacketPtr -CenteralController::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC((Addr) 0); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); + pkt->setLE(value); return pkt; } void -CenteralController::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -CenteralController::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!_blocked) { - blockedPacket = nullptr; - } -} - -void -CenteralController::functionalAccess(PacketPtr pkt) +CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) { - DPRINTF(CenteralController, - "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - reqPort.sendFunctional(pkt); + PacketPtr update = createUpdatePacket(init_addr, init_value); + initialUpdates.push_back(update); } void @@ -176,19 +130,6 @@ CenteralController::recvDoneSignal() } if (done) { - for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) { - PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); - reqPort.sendFunctional(pkt); - - int num_items = system->cacheLineSize() / sizeof(WorkListItem); - WorkListItem items[num_items]; - pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); - - for (int i = 0; i < num_items; i++) { - DPRINTF(FinalAnswer, "%s: WorkListItem[%lu][%d]: %s.\n", - __func__, addr, i, items[i].to_string()); - } - } exitSimLoopNow("no update left to process."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index bd272cf30d..4a4e9c7cb1 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/mpu.hh" +#include "base/addr_range.hh" #include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" @@ -44,46 +45,24 @@ namespace gem5 
class CenteralController : public ClockedObject { private: - class ReqPort : public RequestPort - { - private: - CenteralController* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, CenteralController* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - System* system; - ReqPort reqPort; Addr maxVertexAddr; + std::deque initialUpdates; + std::vector mpuVector; + std::unordered_map addrRangeListMap; - template PacketPtr - createUpdatePacket(Addr addr, T value); - PacketPtr createReadPacket(Addr addr, unsigned int size); - void functionalAccess(PacketPtr pkt); + template PacketPtr createUpdatePacket(Addr addr, T value); public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - virtual void initState(); - virtual void startup(); + virtual void initState() override; + virtual void startup() override; + + void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void recvDoneSignal(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 62cae01613..ac62254fd6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -127,6 +127,15 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); + // bool found = false; + // Addr trimmed_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(addr)) { + // trimmed_addr = range.removeIntlvBits(addr); + // found = true; + // } + // } + // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } @@ -136,6 +145,15 @@ int 
CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); + // bool found = false; + // Addr trimmed_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(addr)) { + // trimmed_addr = range.removeIntlvBits(addr); + // found = true; + // } + // } + // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); @@ -147,7 +165,16 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); + // bool found = false; Addr trimmed_addr = index * sizeof(WorkListItem); + // Addr upgraded_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(trimmed_addr)) { + // upgraded_addr = range.addIntlvBits(trimmed_addr); + // found = true; + // } + // } + // assert(found); return peerMemoryRange.addIntlvBits(trimmed_addr); } diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 07bd255d26..a4bf581224 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -48,6 +48,7 @@ #include "base/bitfield.hh" #include "base/cprintf.hh" +#include "base/intmath.hh" #include "base/logging.hh" #include "base/types.hh" @@ -732,6 +733,36 @@ class AddrRange { return !(*this == r); } + + friend AddrRange + merge(const AddrRange& left, const AddrRange& right) + { + assert(left.interleaved()); + assert(right.interleaved()); + assert(left.mergesWith(right)); + + int bits_org = left.masks.size(); + int bits_new = bits_org - 1; + + int left_match = left.intlvMatch; + int right_match = right.intlvMatch; + assert(std::abs(left_match - right_match) == (1 << bits_new)); + + Addr last_mask = left.masks[left.masks.size() - 1]; + int xor_high_bit_org = 0; + int xor_high_bit_new = 0; + if (!isPowerOf2(last_mask)) { + xor_high_bit_org = ceilLog2(last_mask); + xor_high_bit_new = xor_high_bit_org - 2; + } + int 
intlv_high_bit_org = + ceilLog2(last_mask ^ (1 << xor_high_bit_org)); + int intlv_high_bit_new = intlv_high_bit_org - 2; + + int match = std::min(left_match, right_match); + return AddrRange(left._start, left._end, intlv_high_bit_new, + xor_high_bit_new, bits_new, match); + } }; static inline AddrRangeList From a0461dea5bdcbf67dd89752790902f5e68e070fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 12:07:38 -0700 Subject: [PATCH 171/247] Adding stat to measure response latency. --- configs/accl/sega-simple.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 24 +++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 2 ++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index e0a4fcc89e..fffc273ee1 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -113,7 +113,7 @@ def __init__( ): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '1GHz' + self.clk_domain.clock = '4GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ac62254fd6..43d352da30 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -274,6 +274,7 @@ CoalesceEngine::recvWLRead(Addr addr) "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; return true; } else { // miss @@ -618,9 +619,16 @@ CoalesceEngine::processNextResponseEvent() DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" "responseQueue.size = %d.\n", __func__, responseQueue.size()); - if ((num_responses_sent >= maxRespPerCycle) || - (responseQueue.empty())) { - break; + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; } } @@ -1127,6 +1135,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), @@ -1156,6 +1167,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), "Histogram of the length of the bitvector."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), "Histogram of the latency of processing a memory function.") { @@ -1166,8 +1179,6 @@ CoalesceEngine::CoalesceStats::regStats() { using namespace statistics; - mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); - bitvectorLength.init(64); bitvectorSearchStatus.init(NUM_STATUS); bitvectorSearchStatus.subname(0, "PENDING_READ"); bitvectorSearchStatus.subname(1, "IN_CACHE"); @@ -1181,6 +1192,9 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); + responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 262f75fbcf..705285ba23 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -165,6 +165,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; + statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; @@ -181,6 +182,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; + statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; From fbbd888e40e6e23b61331aee037a1ebc1a71e695 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 17:01:13 -0700 Subject: [PATCH 172/247] Adding stats to count model inaccuracies. 
--- src/accl/graph/sega/coalesce_engine.cc | 9 +++++++-- src/accl/graph/sega/coalesce_engine.hh | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 43d352da30..0a4a041176 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -828,6 +828,8 @@ CoalesceEngine::processNextApplyEvent() } DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + } else { + stats.numInvalidApplies++; } applyQueue.pop_front(); @@ -966,7 +968,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); - stats.numInvalidMemFunctions++; + stats.numInvalidWriteBacks++; } } @@ -1151,7 +1153,10 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. 
(Relative to reset_stats)"), - ADD_STAT(numInvalidMemFunctions, statistics::units::Count::get(), + ADD_STAT(numInvalidApplies, statistics::units::Count::get(), + "Number of times a line has become busy" + " while waiting to be applied."), + ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), "Number of times a scheduled memory function has been invalid."), ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), "Distribution for the location of vertex searches."), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 705285ba23..b1f5b1fea1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -172,7 +172,8 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidMemFunctions; + statistics::Scalar numInvalidApplies; + statistics::Scalar numInvalidWriteBacks; statistics::Vector bitvectorSearchStatus; From 411bfa11be14dda13cc38351c2efeab4737503da Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 29 Sep 2022 15:11:01 -0700 Subject: [PATCH 173/247] style fix. 
--- src/accl/graph/sega/push_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index af1c904eda..6ff1f77c45 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -335,4 +335,4 @@ PushEngine::PushStats::regStats() edgeQueueLatency.init(64); } -} +} // namespace gem5 From bf9bed1ca66b949bba7d03001f34fb6ed30c97b2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 28 Sep 2022 12:37:13 -0700 Subject: [PATCH 174/247] Adding multiple queues and ports in pushEngine --- src/accl/graph/base/data_structs.hh | 24 +++++++- src/accl/graph/sega/MPU.py | 8 ++- src/accl/graph/sega/mpu.cc | 90 ++++++++++++++++++++++++++++- src/accl/graph/sega/mpu.hh | 15 ++++- src/accl/graph/sega/push_engine.cc | 12 +++- src/accl/graph/sega/push_engine.hh | 2 +- 6 files changed, 137 insertions(+), 14 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index a46aaf2de9..d3db3edda5 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -90,7 +90,7 @@ struct __attribute__ ((packed)) Edge static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); -struct CompleteEdge { +struct MetaEdge { uint64_t src; uint64_t dst; uint32_t weight; @@ -98,17 +98,35 @@ struct CompleteEdge { uint64_t entrance; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): + MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): src(src), dst(dst), weight(weight), value(value), entrance(entrance) {} std::string to_string() { - return csprintf("CompleteEdge{src: %lu, dst:%lu, weight: %u}", + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u}", src, dst, weight); } }; +struct Update { + uint64_t src; + uint64_t dst; + uint32_t value; + + Update(): src(0), dst(0), value(0) + {} + Update(uint64_t 
src, uint64_t dst, uint32_t value): + src(src), dst(dst), value(value) + {} + + std::string to_string() + { + return csprintf("Update{src: %lu, dst:%lu, value: %u}", + src, dst, value); + } +}; + template class UniqueFIFO { diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index d80142b21e..1ea6a868a9 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject -class MPU(SimObject): +class MPU(ClockedObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" @@ -39,6 +39,8 @@ class MPU(SimObject): in_port = ResponsePort("Port to receive updates from outside") out_port = RequestPort("Port to send updates to the outside") + out_ports = VectorRequestPort("Ports to remote MPUs ") + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " @@ -46,3 +48,5 @@ class MPU(SimObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") + update_queue_size = Param.Int(16, "Maximum number of entries " + "for each update queue.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 63aa474542..8897e5a959 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,23 +29,32 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "mem/packet_access.hh" #include "sim/sim_exit.hh" namespace gem5 { MPU::MPU(const Params& params): - SimObject(params), + ClockedObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), inPort(name() + ".inPort", this), - outPort(name() + ".outPort", this) + outPort(name() + ".outPort", this), + updateQueueSize(params.update_queue_size), 
+ nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); + + + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + + outports.emplace_back(name() + ".out_ports" + std::to_string(i), this); + } } Port& @@ -55,8 +64,10 @@ MPU::getPort(const std::string& if_name, PortID idx) return inPort; } else if (if_name == "out_port") { return outPort; + } else if (if_name == "out_ports") { + return outports[idx]; } else { - return SimObject::getPort(if_name, idx); + return ClockedObject::getPort(if_name, idx); } } @@ -166,6 +177,79 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } +bool +MPU::enqueueUpdate(Update update) +{ + // Creating the packet + Addr dst_addr = update.dst; + bool found_locally = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + + for (int i = 0; i < outports.size(); i++) { + AddrRangeList addrList = outports[i].getAddrRanges(); + for (auto range : addrList) { + if (range.contains(dst_addr)) { + if (updateQueues[i].size() < updateQueueSize) { + updateQueues[i].emplace_back(update, curTick()); + return true; + } else { + return false; + } + } + } + } + + panic("The update created does not match to any outport."); +} + +template PacketPtr +MPU::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +MPU::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < updateQueues.size(); i++) { + Update update; + Tick entrance_tick; + std::tie
(outports[i].blocked()) { + continue; + } + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outports[i].sendPacket(pkt); + updateQueues[i].pop_front(); + if (updateQueues[i].size() > 0) { + next_time_send += 1; + } + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + +} + void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index edf0350caf..d7042540f0 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -36,7 +36,7 @@ #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "sim/sim_object.hh" +#include "sim/clocked_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -45,7 +45,7 @@ namespace gem5 class CenteralController; -class MPU : public SimObject +class MPU : public ClockedObject { private: class RespPort : public ResponsePort @@ -99,6 +99,16 @@ class MPU : public SimObject AddrRangeList localAddrRange; + uint32_t updateQueueSize; + + std::vector outports; + std::vector>> updateQueues; + + template PacketPtr createUpdatePacket(Addr addr, T value); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + public: PARAMS(MPU); MPU(const Params& params); @@ -115,6 +125,7 @@ class MPU : public SimObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); + bool enqueueUpdate(Update update); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return coalesceEngine->recvVertexPull(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 6ff1f77c45..4546ceee47 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -224,7 +224,7 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = 
reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque edges; + std::deque edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; @@ -255,8 +255,8 @@ PushEngine::processNextPushEvent() return; } - std::deque& edge_list = edgeQueue.front(); - CompleteEdge curr_edge = edge_list.front(); + std::deque& edge_list = edgeQueue.front(); + MetaEdge curr_edge = edge_list.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, curr_edge.to_string()); @@ -267,6 +267,12 @@ PushEngine::processNextPushEvent() curr_edge.dst, update_value); owner->sendPacket(update); + + Update update_2(curr_edge.src, curr_edge.dst, update_value); + (!owner->enqueueUpdate(update_2)) { + // edge_list.pop_front(); + // edge_list.push_back(curr_edge); + } stats.numUpdates++; DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " "with value: %d.\n", __func__, curr_edge.src, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5d2277eb5a..d6763e3ab7 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -102,7 +102,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; - std::deque> edgeQueue; + std::deque> edgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); From 32c75813f3bff0af05a960ad8b40d2f731a9296d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 28 Sep 2022 13:20:32 -0700 Subject: [PATCH 175/247] Changing propagate function --- src/accl/graph/sega/PushEngine.py | 7 ++- src/accl/graph/sega/push_engine.cc | 80 ++++++++++++------------------ src/accl/graph/sega/push_engine.hh | 5 +- 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index ad9ddfefcf..7dba86aff2 100644 --- 
a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -40,6 +40,9 @@ class PushEngine(BaseMemoryEngine): # significantly bigger than push_req_queue_size resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " - "edges read from memory") + "edges read from memory.") + + max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " + "done per cycle.") - workload = Param.String("BFS", "Name of the workload") + workload = Param.String("BFS", "Name of the workload.") diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4546ceee47..c82a4c88be 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,10 +42,11 @@ PushEngine::PushEngine(const Params& params): lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + maxPropagatesPerCycle(params.max_propagates_per_cycle), workload(params.workload), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), stats(*this) {} @@ -55,16 +56,6 @@ PushEngine::registerMPU(MPU* mpu) owner = mpu; } -void -PushEngine::recvReqRetry() -{ - DPRINTF(PushEngine, "%s: Received a req retry.\n", __func__); - if (nextPushEvent.pending()) { - nextPushEvent.wake(); - schedule(nextPushEvent, nextCycle()); - } -} - bool PushEngine::vertexSpace() { @@ -238,57 +229,52 @@ PushEngine::handleMemResp(PacketPtr pkt) delete pkt_data; delete pkt; - if ((!nextPushEvent.pending()) && - (!nextPushEvent.scheduled())) { - schedule(nextPushEvent, nextCycle()); + if (!nextPropagateEvent.scheduled()) { + schedule(nextPropagateEvent, nextCycle()); } return true; } // TODO: Add a parameter to allow for 
doing multiple pushes at the same time. void -PushEngine::processNextPushEvent() +PushEngine::processNextPropagateEvent() { - if (owner->blocked()) { - stats.numNetBlocks++; - nextPushEvent.sleep(); - return; - } + int num_propagates = 0; + while(true) { + std::deque& edge_list = edgeQueue.front(); + MetaEdge curr_edge = edge_list.front(); - std::deque& edge_list = edgeQueue.front(); - MetaEdge curr_edge = edge_list.front(); + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, curr_edge.to_string()); - DPRINTF(PushEngine, "%s: The edge to process is %s.\n", - __func__, curr_edge.to_string()); + uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); - // TODO: Implement propagate function here - uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); - PacketPtr update = createUpdatePacket( - curr_edge.dst, update_value); - - owner->sendPacket(update); - - Update update_2(curr_edge.src, curr_edge.dst, update_value); - (!owner->enqueueUpdate(update_2)) { - // edge_list.pop_front(); - // edge_list.push_back(curr_edge); - } - stats.numUpdates++; - DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " - "with value: %d.\n", __func__, curr_edge.src, + Update update(curr_edge.src, curr_edge.dst, update_value); + edge_list.pop_front(); + if (owner->enqueueUpdate(update)) { + DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to " + "addr: %lu with value: %d.\n", __func__, curr_edge.src, curr_edge.dst, update_value); + stats.numUpdates++; + stats.edgeQueueLatency.sample( + (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); + } else { + edge_list.push_back(curr_edge); + } - stats.edgeQueueLatency.sample( - (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); - edge_list.pop_front(); - if (edge_list.empty()) { - edgeQueue.pop_front(); + num_propagates++; + if (num_propagates >= maxPropagatesPerCycle) { + break; + } + + if (edge_list.empty()) { + edgeQueue.pop_front(); + } } - 
assert(!nextPushEvent.pending()); - assert(!nextPushEvent.scheduled()); + assert(!nextPropagateEvent.scheduled()); if (!edgeQueue.empty()) { - schedule(nextPushEvent, nextCycle()); + schedule(nextPropagateEvent, nextCycle()); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index d6763e3ab7..f3304a8e2a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -102,6 +102,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; + int maxPropagatesPerCycle; std::deque> edgeQueue; std::string workload; @@ -117,8 +118,8 @@ class PushEngine : public BaseMemoryEngine MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); - MemoryEvent nextPushEvent; - void processNextPushEvent(); + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); struct PushStats : public statistics::Group { From 666ab3de782318df4c94fa1baa52c94fd11b6c13 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 29 Sep 2022 14:59:25 -0700 Subject: [PATCH 176/247] Pushing on Marjan's behalf, refactored out_port to vector-port. 
--- configs/accl/sega-single-simple.py | 6 +- configs/accl/sega-single.py | 4 +- src/accl/graph/base/data_structs.hh | 8 +-- src/accl/graph/sega/MPU.py | 3 +- src/accl/graph/sega/mpu.cc | 85 +++++++++++++++-------------- src/accl/graph/sega/mpu.hh | 20 ++++--- src/accl/graph/sega/push_engine.cc | 64 +++++++++------------- src/accl/graph/sega/push_engine.hh | 3 +- 8 files changed, 94 insertions(+), 99 deletions(-) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py index a87e6c53bb..92c1c9cbcb 100644 --- a/configs/accl/sega-single-simple.py +++ b/configs/accl/sega-single-simple.py @@ -92,10 +92,10 @@ def getRespPort(self): def setRespPort(self, port): self.mpu.in_port = port - def getReqPort(self): - return self.mpu.out_port def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port + def getReqPort(self): + return self.mpu.out_ports def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py index d9fe11a781..e4f7942f42 100644 --- a/configs/accl/sega-single.py +++ b/configs/accl/sega-single.py @@ -92,9 +92,9 @@ def setRespPort(self, port): self.mpu.in_port = port def getReqPort(self): - return self.mpu.out_port + return self.mpu.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index d3db3edda5..34c8eb98ce 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -96,10 +96,10 @@ struct MetaEdge { uint32_t weight; uint32_t value; - uint64_t entrance; - - MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): - src(src), dst(dst), weight(weight), value(value), entrance(entrance) + MetaEdge(): src(0), dst(0), weight(0), value(0) + {} + 
MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) {} std::string to_string() diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 1ea6a868a9..aad2e060d1 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -37,9 +37,8 @@ class MPU(ClockedObject): system = Param.System(Parent.any, "System this MPU is a part of") in_port = ResponsePort("Port to receive updates from outside") - out_port = RequestPort("Port to send updates to the outside") - out_ports = VectorRequestPort("Ports to remote MPUs ") + out_ports = VectorRequestPort("Outgoing ports to all MPUs") wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 8897e5a959..f86c7e02b7 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -42,7 +43,6 @@ MPU::MPU(const Params& params): coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), inPort(name() + ".inPort", this), - outPort(name() + ".outPort", this), updateQueueSize(params.update_queue_size), nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { @@ -52,8 +52,9 @@ MPU::MPU(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - - outports.emplace_back(name() + ".out_ports" + std::to_string(i), this); + outPorts.emplace_back( + name() + ".outPorts" + std::to_string(i), this, i); + updateQueues.emplace_back(); } } @@ -62,10 +63,8 @@ MPU::getPort(const std::string& if_name, PortID idx) { if (if_name == "in_port") { return inPort; - } else if (if_name == "out_port") { - return outPort; - } else if (if_name == "outPorts") { - return outports[idx]; + } else if (if_name == 
"out_ports") { + return outPorts[idx]; } else { return ClockedObject::getPort(if_name, idx); } @@ -76,6 +75,9 @@ MPU::init() { localAddrRange = getAddrRanges(); inPort.sendRangeChange(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = getAddrRanges(); + } } void @@ -137,8 +139,6 @@ MPU::ReqPort::sendPacket(PacketPtr pkt) if (!sendTimingReq(pkt)) { blockedPacket = pkt; - } else { - owner->recvReqRetry(); } } @@ -157,6 +157,17 @@ MPU::ReqPort::recvReqRetry() PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); + if (blockedPacket == nullptr) { + owner->recvReqRetry(); + } +} + +void +MPU::recvReqRetry() +{ + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } } bool @@ -180,28 +191,34 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) bool MPU::enqueueUpdate(Update update) { - // Creating the packet Addr dst_addr = update.dst; bool found_locally = false; + bool accepted = false; for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } - - for (int i = 0; i < outports.size(); i++) { - AddrRangeList addrList = outports[i].getAddrRanges(); - for (auto range : addrList) { + DPRINTF(MPU, "%s: TESSSSTSSSS %d, %d, %llu.\n", + __func__, outPorts.size(), updateQueues[0].size(), dst_addr); + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + for (auto range : addr_range_list) { if (range.contains(dst_addr)) { if (updateQueues[i].size() < updateQueueSize) { + DPRINTF(MPU, "%s: Queue %d received an update.\n", + __func__, i); updateQueues[i].emplace_back(update, curTick()); - return true; - } else { - return false; + accepted = true; + break; } } } } - panic("The update created does not match to any outport."); + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; } template PacketPtr @@ -228,14 +245,19 @@ MPU::processNextUpdatePushEvent() 
int next_time_send = 0; for (int i = 0; i < updateQueues.size(); i++) { + if (updateQueues[i].empty()) { + continue; + } + if (outPorts[i].blocked()) { + continue; + } Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[i].front(); - if (outports[i].blocked()) { - continue; - } PacketPtr pkt = createUpdatePacket(update.dst, update.value); - outports[i].sendPacket(pkt); + outPorts[i].sendPacket(pkt); + DPRINTF(MPU, "%s: Sent update from addr: %lu to addr: %lu with value: " + "%d.\n", __func__, update.src, update.dst, update.value); updateQueues[i].pop_front(); if (updateQueues[i].size() > 0) { next_time_send += 1; @@ -256,25 +278,6 @@ MPU::recvVertexPush(Addr addr, WorkListItem wl) pushEngine->recvVertexPush(addr, wl); } -void -MPU::sendPacket(PacketPtr pkt) -{ - bool found_locally = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(pkt->getAddr()); - } - - if (found_locally) { - // TODO: count number of local updates - - } else { - // TOOD: count number of remote updates - - } - - outPort.sendPacket(pkt); -} - void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index d7042540f0..1a642e7873 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,6 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ +#include +#include + #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/push_engine.hh" @@ -74,13 +77,16 @@ class MPU : public ClockedObject private: MPU* owner; PacketPtr blockedPacket; + PortID _id; public: - ReqPort(const std::string& name, MPU* owner) : - RequestPort(name, owner), owner(owner), blockedPacket(nullptr) + ReqPort(const std::string& name, MPU* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } + PortID id() 
{ return _id; } protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -95,15 +101,17 @@ class MPU : public ClockedObject PushEngine* pushEngine; RespPort inPort; - ReqPort outPort; AddrRangeList localAddrRange; uint32_t updateQueueSize; - std::vector outports; + std::unordered_map portAddrMap; + + std::vector outPorts; std::vector>> updateQueues; + template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextUpdatePushEvent; @@ -133,9 +141,7 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - bool blocked() { return outPort.blocked(); } - void sendPacket(PacketPtr pkt); - void recvReqRetry() { pushEngine->recvReqRetry(); } + void recvReqRetry(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c82a4c88be..d533f1ea79 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -215,15 +215,18 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque edges; + std::deque> edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back( - push_info.src, edge_dst, edge_weight, push_info.value, curTick()); + MetaEdge meta_edge( + push_info.src, edge_dst, edge_weight, push_info.value); + edges.emplace_back(meta_edge, curTick()); } + assert(!edges.empty()); edgeQueue.push_back(edges); + onTheFlyMemReqs--; reqInfoMap.erase(pkt->req); delete pkt_data; @@ -235,40 +238,44 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -// TODO: Add a parameter to allow for doing multiple pushes at the same time. 
void PushEngine::processNextPropagateEvent() { int num_propagates = 0; while(true) { - std::deque& edge_list = edgeQueue.front(); - MetaEdge curr_edge = edge_list.front(); + std::deque>& edge_list = edgeQueue.front(); + MetaEdge meta_edge; + Tick entrance_tick; + std::tie(meta_edge, entrance_tick) = edge_list.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", - __func__, curr_edge.to_string()); - - uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); + __func__, meta_edge.to_string()); - Update update(curr_edge.src, curr_edge.dst, update_value); + uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); + Update update(meta_edge.src, meta_edge.dst, update_value); edge_list.pop_front(); + if (owner->enqueueUpdate(update)) { - DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to " - "addr: %lu with value: %d.\n", __func__, curr_edge.src, - curr_edge.dst, update_value); + DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", + __func__, meta_edge.to_string()); stats.numUpdates++; stats.edgeQueueLatency.sample( - (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { - edge_list.push_back(curr_edge); + edge_list.emplace_back(meta_edge, entrance_tick); } - num_propagates++; - if (num_propagates >= maxPropagatesPerCycle) { + if (edge_list.empty()) { + edgeQueue.pop_front(); + } + + if (edgeQueue.empty()) { break; } - if (edge_list.empty()) { - edgeQueue.pop_front(); + num_propagates++; + if (num_propagates >= maxPropagatesPerCycle) { + break; } } @@ -278,25 +285,6 @@ PushEngine::processNextPropagateEvent() } } -template PacketPtr -PushEngine::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared( - addr, sizeof(T), 0, _requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new 
Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} - PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index f3304a8e2a..fed6909733 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -103,11 +103,10 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; int maxPropagatesPerCycle; - std::deque> edgeQueue; + std::deque>> edgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); - template PacketPtr createUpdatePacket(Addr addr, T value); bool vertexSpace(); bool workLeft(); From 194a5e4983af2498452daba971db27a2468148b6 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 30 Sep 2022 08:37:23 -0700 Subject: [PATCH 177/247] Attempting to add multi-inports to MPU --- configs/accl/sega-single-simple.py | 4 +-- configs/accl/sega.py | 49 ++++++++++++------------------ src/accl/graph/sega/MPU.py | 5 +-- src/accl/graph/sega/mpu.cc | 37 +++++++++++++--------- src/accl/graph/sega/mpu.hh | 13 ++++---- src/accl/graph/sega/wl_engine.cc | 2 +- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py index 92c1c9cbcb..eacb16d3d1 100644 --- a/configs/accl/sega-single-simple.py +++ b/configs/accl/sega-single-simple.py @@ -88,9 +88,9 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_port + return self.mpu.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.mpu.in_ports = port def setReqPort(self, port): self.mpu.out_ports = port diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a67551a5fd..455d081145 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -78,18 +78,19 @@ def __init__(self, edge_memory_size: str, 
cache_size: str): self.mpu = MPU( wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine + push_engine=self.push_engine, + update_queue_size=16 ) def getRespPort(self): - return self.mpu.in_port + return self.mpu.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.mpu.in_ports = port def getReqPort(self): - return self.mpu.out_port + return self.mpu.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.dram.range = vertex_range @@ -97,14 +98,7 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): - def __init__( - self, - num_mpus, - cache_size, - graph_path, - first_addr, - first_value - ): + def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -112,19 +106,7 @@ def __init__( self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar( - frontend_latency=1, - forward_latency=1, - response_latency=1, - width=64 - ) - - self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, - image_file=f"{graph_path}/vertices" - ) - self.ctrl.req_port = self.interconnect.cpu_side_ports + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( AddrRange(start=0, size="4GiB"), @@ -137,13 +119,18 @@ def __init__( gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpt.setReqPort(self.interconnect.cpu_side_ports) - gpt.setRespPort(self.interconnect.mem_side_ports) gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def 
create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -160,10 +147,12 @@ def get_inputs(): if __name__ == "__m5_main__": num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) m5.instantiate() + system.create_initial_bfs_update(init_addr, init_value) exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} because {exit_event.getCause()}") + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index aad2e060d1..aea76db86f 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -36,7 +36,8 @@ class MPU(ClockedObject): system = Param.System(Parent.any, "System this MPU is a part of") - in_port = ResponsePort("Port to receive updates from outside") + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote outside") out_ports = VectorRequestPort("Outgoing ports to all MPUs") @@ -47,5 +48,5 @@ class MPU(ClockedObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") - update_queue_size = Param.Int(16, "Maximum number of entries " + update_queue_size = Param.Int("Maximum number of entries " "for each update queue.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index f86c7e02b7..4a80b22979 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -42,7 +42,6 @@ MPU::MPU(const Params& params): wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), - inPort(name() + ".inPort", this), updateQueueSize(params.update_queue_size), 
nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { @@ -53,16 +52,21 @@ MPU::MPU(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( - name() + ".outPorts" + std::to_string(i), this, i); + name() + ".out_ports" + std::to_string(i), this, i); updateQueues.emplace_back(); } + + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } } Port& MPU::getPort(const std::string& if_name, PortID idx) { - if (if_name == "in_port") { - return inPort; + if (if_name == "in_ports") { + return inPorts[idx]; } else if (if_name == "out_ports") { return outPorts[idx]; } else { @@ -74,9 +78,11 @@ void MPU::init() { localAddrRange = getAddrRanges(); - inPort.sendRangeChange(); + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = getAddrRanges(); + portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); } } @@ -101,6 +107,14 @@ MPU::RespPort::checkRetryReq() } } +void +MPU::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); ++i) { + inPorts[i].checkRetryReq(); + } +} + bool MPU::RespPort::recvTimingReq(PacketPtr pkt) { @@ -197,16 +211,13 @@ MPU::enqueueUpdate(Update update) for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } - DPRINTF(MPU, "%s: TESSSSTSSSS %d, %d, %llu.\n", - __func__, outPorts.size(), updateQueues[0].size(), dst_addr); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; for (auto range : addr_range_list) { if (range.contains(dst_addr)) { - if (updateQueues[i].size() < updateQueueSize) { - DPRINTF(MPU, "%s: Queue %d received an update.\n", - __func__, i); - updateQueues[i].emplace_back(update, curTick()); + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(MPU, "%s: Queue %d 
received an update.\n", __func__, i); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); accepted = true; break; } @@ -268,8 +279,6 @@ MPU::processNextUpdatePushEvent() if (next_time_send > 0) { schedule(nextUpdatePushEvent, nextCycle()); } - - } void diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 1a642e7873..ff17eada0e 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -56,13 +56,16 @@ class MPU : public ClockedObject private: MPU* owner; bool needSendRetryReq; + PortID _id; public: - RespPort(const std::string& name, MPU* owner): - ResponsePort(name, owner), owner(owner), needSendRetryReq(false) + RespPort(const std::string& name, MPU* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; + PortID id() { return _id; } void checkRetryReq(); protected: @@ -100,18 +103,16 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - RespPort inPort; - AddrRangeList localAddrRange; uint32_t updateQueueSize; std::unordered_map portAddrMap; + std::vector inPorts; std::vector outPorts; std::vector>> updateQueues; - template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextUpdatePushEvent; @@ -129,7 +130,6 @@ class MPU : public ClockedObject void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } bool handleIncomingUpdate(PacketPtr pkt); - void checkRetryReq() { inPort.checkRetryReq(); } void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); @@ -142,6 +142,7 @@ class MPU : public ClockedObject void recvVertexPush(Addr addr, WorkListItem wl); void recvReqRetry(); + void checkRetryReq(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 
5d4dd1723e..0267bd46b6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -74,7 +74,7 @@ WLEngine::reduce(uint32_t update, uint32_t value) bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - assert(updateQueue.size() <= updateQueueSize); + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } From cddd042f6330e0da3e36dc2f278898944eb30d31 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 3 Oct 2022 09:06:36 -0700 Subject: [PATCH 178/247] Moving reqPorts from MPU to PushEngine --- configs/accl/sega.py | 10 +- src/accl/graph/sega/MPU.py | 4 - src/accl/graph/sega/PushEngine.py | 7 +- src/accl/graph/sega/mpu.cc | 136 +------------------------ src/accl/graph/sega/mpu.hh | 36 ------- src/accl/graph/sega/push_engine.cc | 154 ++++++++++++++++++++++++++++- src/accl/graph/sega/push_engine.hh | 36 +++++++ 7 files changed, 200 insertions(+), 183 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 455d081145..21a041180f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,7 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64 + resp_queue_size=64, + update_queue_size=16 ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) @@ -78,8 +79,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.mpu = MPU( wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine, - update_queue_size=16 + push_engine=self.push_engine ) def getRespPort(self): @@ -88,9 +88,9 @@ def setRespPort(self, port): self.mpu.in_ports = port def getReqPort(self): - return self.mpu.out_ports + return self.push_engine.out_ports def setReqPort(self, port): - self.mpu.out_ports = port + self.push_engine.out_ports = port def 
set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.dram.range = vertex_range diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index aea76db86f..3547cb8817 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -39,8 +39,6 @@ class MPU(ClockedObject): in_ports = VectorResponsePort("Incoming Ports to receive updates from " "remote outside") - out_ports = VectorRequestPort("Outgoing ports to all MPUs") - wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " @@ -48,5 +46,3 @@ class MPU(ClockedObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") - update_queue_size = Param.Int("Maximum number of entries " - "for each update queue.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 7dba86aff2..5e0d2b3212 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,6 +34,8 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' + workload = Param.String("BFS", "Name of the workload.") + push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be @@ -45,4 +47,7 @@ class PushEngine(BaseMemoryEngine): max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " "done per cycle.") - workload = Param.String("BFS", "Name of the workload.") + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 4a80b22979..76d7d3114f 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -41,21 +41,12 @@ MPU::MPU(const Params& params): system(params.system), wlEngine(params.wl_engine), 
coalesceEngine(params.coalesce_engine), - pushEngine(params.push_engine), - updateQueueSize(params.update_queue_size), - nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) + pushEngine(params.push_engine) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); - - for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); - updateQueues.emplace_back(); - } - for (int i = 0; i < params.port_in_ports_connection_count; ++i) { inPorts.emplace_back( name() + ".in_ports" + std::to_string(i), this, i); @@ -67,8 +58,6 @@ MPU::getPort(const std::string& if_name, PortID idx) { if (if_name == "in_ports") { return inPorts[idx]; - } else if (if_name == "out_ports") { - return outPorts[idx]; } else { return ClockedObject::getPort(if_name, idx); } @@ -77,13 +66,9 @@ MPU::getPort(const std::string& if_name, PortID idx) void MPU::init() { - localAddrRange = getAddrRanges(); for (int i = 0; i < inPorts.size(); i++){ inPorts[i].sendRangeChange(); } - for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); - } } void @@ -144,46 +129,6 @@ MPU::RespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } -void -MPU::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(blockedPacket != nullptr, - "Should never try to send if blocked!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - } -} - -bool -MPU::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -MPU::ReqPort::recvReqRetry() -{ - panic_if(blockedPacket == nullptr, - "Received retry without a blockedPacket."); - - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); - if (blockedPacket == nullptr) { - owner->recvReqRetry(); - } -} - -void -MPU::recvReqRetry() -{ - if (!nextUpdatePushEvent.scheduled()) { - schedule(nextUpdatePushEvent, nextCycle()); - } -} - bool MPU::handleIncomingUpdate(PacketPtr pkt) { @@ -202,85 +147,6 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } -bool -MPU::enqueueUpdate(Update update) -{ - Addr dst_addr = update.dst; - bool found_locally = false; - bool accepted = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(dst_addr); - } - for (int i = 0; i < outPorts.size(); i++) { - AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - for (auto range : addr_range_list) { - if (range.contains(dst_addr)) { - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(MPU, "%s: Queue %d received an update.\n", __func__, i); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - accepted = true; - break; - } - } - } - } - - if (accepted && (!nextUpdatePushEvent.scheduled())) { - schedule(nextUpdatePushEvent, nextCycle()); - } - - return accepted; -} - -template PacketPtr -MPU::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) 1) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} - -void -MPU::processNextUpdatePushEvent() -{ - int next_time_send = 0; - - 
for (int i = 0; i < updateQueues.size(); i++) { - if (updateQueues[i].empty()) { - continue; - } - if (outPorts[i].blocked()) { - continue; - } - Update update; - Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[i].front(); - PacketPtr pkt = createUpdatePacket(update.dst, update.value); - outPorts[i].sendPacket(pkt); - DPRINTF(MPU, "%s: Sent update from addr: %lu to addr: %lu with value: " - "%d.\n", __func__, update.src, update.dst, update.value); - updateQueues[i].pop_front(); - if (updateQueues[i].size() > 0) { - next_time_send += 1; - } - } - - assert(!nextUpdatePushEvent.scheduled()); - if (next_time_send > 0) { - schedule(nextUpdatePushEvent, nextCycle()); - } -} - void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index ff17eada0e..4215f82d5b 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,27 +75,6 @@ class MPU : public ClockedObject virtual void recvRespRetry(); }; - class ReqPort : public RequestPort - { - private: - MPU* owner; - PacketPtr blockedPacket; - PortID _id; - - public: - ReqPort(const std::string& name, MPU* owner, PortID id) : - RequestPort(name, owner), - owner(owner), blockedPacket(nullptr), _id(id) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return (blockedPacket != nullptr); } - PortID id() { return _id; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - System* system; CenteralController* centeralController; @@ -103,20 +82,7 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - AddrRangeList localAddrRange; - - uint32_t updateQueueSize; - - std::unordered_map portAddrMap; - std::vector inPorts; - std::vector outPorts; - std::vector>> updateQueues; - - template PacketPtr createUpdatePacket(Addr addr, T value); - - EventFunctionWrapper nextUpdatePushEvent; - void processNextUpdatePushEvent(); public: PARAMS(MPU); @@ 
-133,7 +99,6 @@ class MPU : public ClockedObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); - bool enqueueUpdate(Update update); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return coalesceEngine->recvVertexPull(); } @@ -141,7 +106,6 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - void recvReqRetry(); void checkRetryReq(); void recvDoneSignal(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d533f1ea79..70c10cc358 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -44,11 +44,40 @@ PushEngine::PushEngine(const Params& params): onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), maxPropagatesPerCycle(params.max_propagates_per_cycle), workload(params.workload), + updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), stats(*this) -{} +{ + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back( + name() + ".out_ports" + std::to_string(i), this, i); + updateQueues.emplace_back(); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = 
outPorts[i].getAddrRanges(); + } +} void PushEngine::registerMPU(MPU* mpu) @@ -56,6 +85,46 @@ PushEngine::registerMPU(MPU* mpu) owner = mpu; } +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + owner->recvReqRetry(); + } +} + +void +PushEngine::recvReqRetry() +{ + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + bool PushEngine::vertexSpace() { @@ -255,7 +324,7 @@ PushEngine::processNextPropagateEvent() Update update(meta_edge.src, meta_edge.dst, update_value); edge_list.pop_front(); - if (owner->enqueueUpdate(update)) { + if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", __func__, meta_edge.to_string()); stats.numUpdates++; @@ -285,6 +354,87 @@ PushEngine::processNextPropagateEvent() } } +bool +PushEngine::enqueueUpdate(Update update) +{ + Addr dst_addr = update.dst; + bool found_locally = false; + bool accepted = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + for (auto range : addr_range_list) { + if (range.contains(dst_addr)) { + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: Queue %d received an update.\n", __func__, i); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, 
"%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + accepted = true; + break; + } + } + } + } + + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; +} + +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +PushEngine::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < updateQueues.size(); i++) { + if (updateQueues[i].empty()) { + continue; + } + if (outPorts[i].blocked()) { + continue; + } + Update update; + Tick entrance_tick; + std::tie(update, entrance_tick) = updateQueues[i].front(); + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outPorts[i].sendPacket(pkt); + DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " + "%d.\n", __func__, update.src, update.dst, update.value); + updateQueues[i].pop_front(); + DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + if (updateQueues[i].size() > 0) { + next_time_send += 1; + } + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index fed6909733..99fec33f2c 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -42,6 +42,27 @@ class MPU; class PushEngine : public BaseMemoryEngine { private: + class ReqPort : public 
RequestPort + { + private: + PushEngine* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, PushEngine* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + PortID id() { return _id; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + class EdgeReadInfoGen { private: Addr _start; @@ -95,6 +116,8 @@ class PushEngine : public BaseMemoryEngine bool _running; Tick lastIdleEntranceTick; + AddrRangeList localAddrRange; + int numPendingPulls; int edgePointerQueueSize; std::deque edgePointerQueue; @@ -108,6 +131,13 @@ class PushEngine : public BaseMemoryEngine std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); + int updateQueueSize; + std::vector>> updateQueues; + template PacketPtr createUpdatePacket(Addr addr, T value); + bool enqueueUpdate(Update update); + std::unordered_map portAddrMap; + std::vector outPorts; + bool vertexSpace(); bool workLeft(); @@ -120,6 +150,9 @@ class PushEngine : public BaseMemoryEngine EventFunctionWrapper nextPropagateEvent; void processNextPropagateEvent(); + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + struct PushStats : public statistics::Group { PushStats(PushEngine &push); @@ -147,6 +180,9 @@ class PushEngine : public BaseMemoryEngine public: PARAMS(PushEngine); PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; void registerMPU(MPU* mpu); virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } From d2e6f2e7119437f6762f03cf93f85bdb0beb67b5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 3 Oct 2022 10:01:32 -0700 Subject: [PATCH 179/247] Moving respPorts from MPU to WLEngine --- configs/accl/sega.py | 4 +- 
src/accl/graph/sega/MPU.py | 7 +-- src/accl/graph/sega/WLEngine.py | 6 ++- src/accl/graph/sega/mpu.cc | 79 ++------------------------- src/accl/graph/sega/mpu.hh | 39 ++------------ src/accl/graph/sega/wl_engine.cc | 93 ++++++++++++++++++++++++++++++-- src/accl/graph/sega/wl_engine.hh | 34 ++++++++++++ 7 files changed, 140 insertions(+), 122 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 21a041180f..c6c2171315 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -83,9 +83,9 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_ports + return self.wl_engine.in_ports def setRespPort(self, port): - self.mpu.in_ports = port + self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 3547cb8817..8d2453b01c 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,18 +27,15 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.SimObject import SimObject -class MPU(ClockedObject): +class MPU(SimObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" system = Param.System(Parent.any, "System this MPU is a part of") - in_ports = VectorResponsePort("Incoming Ports to receive updates from " - "remote outside") - wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index a44352ab9b..91325ab53f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,11 +34,15 @@ class WLEngine(BaseReduceEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote 
outside") + update_queue_size = Param.Int("Size of the queue WLEngine stores " "the incoming updates") + register_file_size = Param.Int("Number of internal registers the " "WLEngine has. It can service as " "many updates as this queueu has " - "entries at the same time.") # 4 is arbitrary + "entries at the same time.") workload = Param.String('BFS',"Name of the workload") \ No newline at end of file diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 76d7d3114f..c8d0f636f2 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,7 +29,6 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" -#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -37,7 +36,7 @@ namespace gem5 { MPU::MPU(const Params& params): - ClockedObject(params), + SimObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), @@ -46,30 +45,10 @@ MPU::MPU(const Params& params): wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); - - for (int i = 0; i < params.port_in_ports_connection_count; ++i) { - inPorts.emplace_back( - name() + ".in_ports" + std::to_string(i), this, i); - } -} - -Port& -MPU::getPort(const std::string& if_name, PortID idx) -{ - if (if_name == "in_ports") { - return inPorts[idx]; - } else { - return ClockedObject::getPort(if_name, idx); - } } -void -MPU::init() -{ - for (int i = 0; i < inPorts.size(); i++){ - inPorts[i].sendRangeChange(); - } -} +MPU::~MPU() +{} void MPU::registerCenteralController(CenteralController* centeral_controller) @@ -77,58 +56,6 @@ MPU::registerCenteralController(CenteralController* centeral_controller) centeralController = centeral_controller; } -AddrRangeList -MPU::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -void -MPU::RespPort::checkRetryReq() -{ - if (needSendRetryReq) { - sendRetryReq(); - needSendRetryReq = false; - } -} 
- -void -MPU::checkRetryReq() -{ - for (int i = 0; i < inPorts.size(); ++i) { - inPorts[i].checkRetryReq(); - } -} - -bool -MPU::RespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleIncomingUpdate(pkt)) { - needSendRetryReq = true; - return false; - } - - return true; -} - -Tick -MPU::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -MPU::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -MPU::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - bool MPU::handleIncomingUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 4215f82d5b..a1e5055226 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -38,8 +38,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/packet.hh" -#include "mem/port.hh" -#include "sim/clocked_object.hh" +#include "sim/sim_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -48,33 +47,9 @@ namespace gem5 class CenteralController; -class MPU : public ClockedObject +class MPU : public SimObject { private: - class RespPort : public ResponsePort - { - private: - MPU* owner; - bool needSendRetryReq; - PortID _id; - - public: - RespPort(const std::string& name, MPU* owner, PortID id): - ResponsePort(name, owner), - owner(owner), needSendRetryReq(false), _id(id) - {} - virtual AddrRangeList getAddrRanges() const; - - PortID id() { return _id; } - void checkRetryReq(); - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - System* system; CenteralController* centeralController; @@ -82,20 +57,16 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - std::vector inPorts; - public: PARAMS(MPU); MPU(const Params& params); - Port& getPort(const std::string& 
if_name, - PortID idx = InvalidPortID) override; - virtual void init() override; + ~MPU(); void registerCenteralController(CenteralController* centeral_controller); AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } - bool handleIncomingUpdate(PacketPtr pkt); + void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); @@ -106,8 +77,6 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - void checkRetryReq(); - void recvDoneSignal(); bool done(); }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 0267bd46b6..9a548a3255 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -45,7 +45,30 @@ WLEngine::WLEngine(const WLEngineParams& params): nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) -{} +{ + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } +} + +Port& +WLEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_ports") { + return inPorts[idx]; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +WLEngine::init() +{ + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } +} void WLEngine::registerMPU(MPU* mpu) @@ -53,6 +76,70 @@ WLEngine::registerMPU(MPU* mpu) owner = mpu; } +AddrRangeList +WLEngine::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void 
+WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + sendRetryReq(); + needSendRetryReq = false; + } +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); ++i) { + inPorts[i].checkRetryReq(); + } +} + bool WLEngine::done() { @@ -144,7 +231,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - owner->checkRetryReq(); + checkRetryReq(); vertexReadTime[update_addr] = curTick(); } } else { @@ -173,7 +260,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - owner->checkRetryReq(); + checkRetryReq(); } if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f888979be9..5f08678d26 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,8 +45,34 @@ class MPU; class WLEngine : public BaseReduceEngine { private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + bool needSendRetryReq; + PortID _id; + + public: + RespPort(const std::string& name, WLEngine* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) + {} + virtual AddrRangeList getAddrRanges() const; + + PortID id() { return _id; } + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + MPU* owner; + std::vector inPorts; + int updateQueueSize; std::deque> updateQueue; @@ -86,11 +112,19 @@ class WLEngine : public BaseReduceEngine public: PARAMS(WLEngine); WLEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; void registerMPU(MPU* mpu); + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); + void checkRetryReq(); + bool done(); }; From 07cfd5fbb3381f8be86224e491c0eb0dc5d9da97 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 3 Oct 2022 12:58:25 -0700 Subject: [PATCH 180/247] Updating dprintfs. 
--- src/accl/graph/sega/push_engine.cc | 50 ++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70c10cc358..9039eb408d 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -109,10 +109,12 @@ PushEngine::ReqPort::recvReqRetry() panic_if(blockedPacket == nullptr, "Received retry without a blockedPacket."); + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); if (blockedPacket == nullptr) { + DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__); owner->recvReqRetry(); } } @@ -120,6 +122,7 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::recvReqRetry() { + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); if (!nextUpdatePushEvent.scheduled()) { schedule(nextUpdatePushEvent, nextCycle()); } @@ -325,7 +328,7 @@ PushEngine::processNextPropagateEvent() edge_list.pop_front(); if (enqueueUpdate(update)) { - DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", + DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); stats.numUpdates++; stats.edgeQueueLatency.sample( @@ -363,14 +366,17 @@ PushEngine::enqueueUpdate(Update update) for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } + DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; for (auto range : addr_range_list) { if (range.contains(dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), 
outPorts[i].id()); if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(PushEngine, "%s: Queue %d received an update.\n", __func__, i); + DPRINTF(PushEngine, "%s: There is a free entry available in queue %d.\n", __func__, outPorts[i].id()); updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back of queue for port %d is. Size of queue for port %d is %d.\n", __func__, outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); accepted = true; break; } @@ -408,23 +414,47 @@ PushEngine::processNextUpdatePushEvent() { int next_time_send = 0; - for (int i = 0; i < updateQueues.size(); i++) { - if (updateQueues[i].empty()) { + // for (int i = 0; i < updateQueues.size(); i++) { + // if (updateQueues[i].empty()) { + // continue; + // } + // if (outPorts[i].blocked()) { + // continue; + // } + // Update update; + // Tick entrance_tick; + // std::tie(update, entrance_tick) = updateQueues[i].front(); + // PacketPtr pkt = createUpdatePacket(update.dst, update.value); + // outPorts[i].sendPacket(pkt); + // DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " + // "%d.\n", __func__, update.src, update.dst, update.value); + // updateQueues[i].pop_front(); + // DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + // if (updateQueues[i].size() > 0) { + // next_time_send += 1; + // } + // } + + for (int i = 0; i < outPorts.size(); i++) { + if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, outPorts[i].id()); continue; } - if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, outPorts[i].id()); + if (updateQueues[outPorts[i].id()].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for port %d is empty.\n", __func__, 
outPorts[i].id()); continue; } + DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[i].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " - "%d.\n", __func__, update.src, update.dst, update.value); - updateQueues[i].pop_front(); + DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d.\n", __func__, outPorts[i].id(), outPorts[i].id()); + updateQueues[outPorts[i].id()].pop_front(); DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); - if (updateQueues[i].size() > 0) { + if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } } From bab798ddaa2384e934ebc1775ac5755f83affdc8 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 4 Oct 2022 12:49:29 -0700 Subject: [PATCH 181/247] Fixing the problems with retry --- configs/accl/sega.py | 6 +++--- src/accl/graph/sega/push_engine.cc | 8 ++++---- src/accl/graph/sega/push_engine.hh | 3 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c6c2171315..6b198c5f4a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,8 +48,8 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 + update_queue_size=2, + register_file_size=2 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16 + update_queue_size=2 ) self.vertex_mem_ctrl = 
MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9039eb408d..238b8a89fb 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -54,7 +54,6 @@ PushEngine::PushEngine(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( name() + ".out_ports" + std::to_string(i), this, i); - updateQueues.emplace_back(); } } @@ -93,6 +92,7 @@ PushEngine::ReqPort::sendPacket(PacketPtr pkt) // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { + DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__); blockedPacket = pkt; } } @@ -386,7 +386,7 @@ PushEngine::enqueueUpdate(Update update) if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); - } + } return accepted; } @@ -448,10 +448,10 @@ PushEngine::processNextUpdatePushEvent() DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[i].front(); + std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d.\n", __func__, outPorts[i].id(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d the queue size is %d.\n", __func__, update.to_string(), outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); updateQueues[outPorts[i].id()].pop_front(); DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); if (updateQueues[outPorts[i].id()].size() > 0) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 99fec33f2c..4e0cdbc526 100644 --- 
a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -132,10 +132,11 @@ class PushEngine : public BaseMemoryEngine uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; - std::vector>> updateQueues; + // std::vector>> updateQueues; template PacketPtr createUpdatePacket(Addr addr, T value); bool enqueueUpdate(Update update); std::unordered_map portAddrMap; + std::unordered_map>> updateQueues; std::vector outPorts; bool vertexSpace(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9a548a3255..116cdf3f77 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -98,8 +98,8 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - sendRetryReq(); needSendRetryReq = false; + sendRetryReq(); } } From 6140135bdc790a77b13d8026292874c3d91154fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Oct 2022 14:10:57 -0700 Subject: [PATCH 182/247] Fixing done, code style and conifg. Adding a stat. 
--- configs/accl/sega-simple.py | 68 ++++++------- configs/accl/sega-single-simple.py | 151 ---------------------------- configs/accl/sega-single.py | 155 ----------------------------- configs/accl/sega.py | 14 +-- src/accl/graph/sega/mpu.cc | 3 - src/accl/graph/sega/mpu.hh | 1 - src/accl/graph/sega/push_engine.cc | 97 ++++++++++-------- src/accl/graph/sega/push_engine.hh | 4 +- 8 files changed, 90 insertions(+), 403 deletions(-) delete mode 100644 configs/accl/sega-single-simple.py delete mode 100644 configs/accl/sega-single.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index fffc273ee1..54a90281bf 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -48,20 +48,21 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 + update_queue_size=128, + register_file_size=64 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64 + resp_queue_size=64, + update_queue_size=16, ) self.vertex_mem_ctrl = SimpleMemory( @@ -88,14 +89,14 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_port + return self.wl_engine.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.wl_engine.in_ports = port def getReqPort(self): - return self.mpu.out_port + return self.push_engine.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.push_engine.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range @@ -103,54 +104,39 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.image_file = edge_image class 
SEGA(System): - def __init__( - self, - num_mpus, - cache_size, - graph_path, - first_addr, - first_value - ): + def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '4GHz' + self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar( - frontend_latency=1, - forward_latency=1, - response_latency=1, - width=64 - ) - - self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, - image_file=f"{graph_path}/vertices" - ) - - self.ctrl.req_port = self.interconnect.cpu_side_ports + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpt.setReqPort(self.interconnect.cpu_side_ports) - gpt.setRespPort(self.interconnect.mem_side_ports) gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -167,11 +153,13 @@ def get_inputs(): if __name__ == "__m5_main__": num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) m5.instantiate() + system.create_initial_bfs_update(init_addr, init_value) + exit_event 
= m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py deleted file mode 100644 index eacb16d3d1..0000000000 --- a/configs/accl/sega-single-simple.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64 - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.mpu.in_ports - def setRespPort(self, port): - self.mpu.in_ports = port - - def setReqPort(self, port): - self.mpu.out_ports = port - def getReqPort(self): - return self.mpu.out_ports - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - - def set_vertex_image(self, vertex_image): - self.vertex_mem_ctrl.image_file = vertex_image - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image - -class SEGA(System): - def __init__(self, cache_size, 
graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - gpts = [GPT("8GiB", cache_size)] - gpts[0].set_vertex_range(AddrRange("4GiB")) - gpts[0].set_edge_image(f"{graph_path}/edgelist_0") - gpts[0].setReqPort(gpts[0].getRespPort()) - self.gpts = gpts - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.cache_size, args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py deleted file mode 100644 index e4f7942f42..0000000000 --- a/configs/accl/sega-single.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64 - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GiB/s" - ) - - self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.mpu.in_port - def setRespPort(self, port): - self.mpu.in_port = port - - def getReqPort(self): - return self.mpu.out_ports - def setReqPort(self, port): - self.mpu.out_ports = port - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image - -class SEGA(System): - def __init__(self, cache_size, graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - 
self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - plain_vertex_range = AddrRange("4GiB") - self._vertex_ranges = interleave_addresses( - plain_vertex_range, - 1, - 32 - ) - - gpts = [GPT("8GiB", cache_size)] - gpts[0].set_vertex_ranges(self._vertex_ranges[0]) - gpts[0].set_edge_image(f"{graph_path}/edgelist_0") - gpts[0].setReqPort(gpts[0].getRespPort()) - self.gpts = gpts - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.cache_size, args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 6b198c5f4a..fab414f2c5 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,21 +48,21 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=2, - register_file_size=2 + update_queue_size=128, + register_file_size=64 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 + num_mshr_entry=64, 
+ num_tgts_per_mshr=64, + max_resp_per_cycle=8 ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=2 + update_queue_size=16 ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) @@ -101,7 +101,7 @@ class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '1GHz' + self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index c8d0f636f2..44054d1efb 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -47,9 +47,6 @@ MPU::MPU(const Params& params): pushEngine->registerMPU(this); } -MPU::~MPU() -{} - void MPU::registerCenteralController(CenteralController* centeral_controller) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index a1e5055226..229bd28950 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -60,7 +60,6 @@ class MPU : public SimObject public: PARAMS(MPU); MPU(const Params& params); - ~MPU(); void registerCenteralController(CenteralController* centeral_controller); AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 238b8a89fb..5835b61fc6 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -53,7 +53,7 @@ PushEngine::PushEngine(const Params& params): { for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); + name() + ".out_ports" + std::to_string(i), this, i); } } @@ -144,9 +144,12 @@ PushEngine::workLeft() bool PushEngine::done() { - return edgeQueue.empty() && - (onTheFlyMemReqs == 0) && - 
edgePointerQueue.empty(); + bool empty_update_queues = true; + for (int i = 0; i < outPorts.size(); i++) { + empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + } + return empty_update_queues && edgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } @@ -357,6 +360,16 @@ PushEngine::processNextPropagateEvent() } } +bool +contains(AddrRangeList range_list, Addr addr) +{ + bool found = false; + for (auto range: range_list) { + found |= range.contains(addr); + } + return found; +} + bool PushEngine::enqueueUpdate(Update update) { @@ -369,24 +382,32 @@ PushEngine::enqueueUpdate(Update update) DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - for (auto range : addr_range_list) { - if (range.contains(dst_addr)) { - DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", __func__, update.to_string(), outPorts[i].id()); - DPRINTF(PushEngine, "%s: There are %d updates already in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(PushEngine, "%s: There is a free entry available in queue %d.\n", __func__, outPorts[i].id()); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Emplaced the update at the back of queue for port %d is. 
Size of queue for port %d is %d.\n", __func__, outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); - accepted = true; - break; - } + if (contains(addr_range_list, dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", + __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already " + "in queue for port %d.\n", __func__, + updateQueues[outPorts[i].id()].size(), + outPorts[i].id()); + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue %d.\n", __func__, outPorts[i].id()); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back " + "of queue for port %d is. Size of queue " + "for port %d is %d.\n", __func__, + outPorts[i].id(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); + accepted = true; + stats.updateQueueLength.sample( + updateQueues[outPorts[i].id()].size()); } } } if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); - } + } return accepted; } @@ -414,46 +435,31 @@ PushEngine::processNextUpdatePushEvent() { int next_time_send = 0; - // for (int i = 0; i < updateQueues.size(); i++) { - // if (updateQueues[i].empty()) { - // continue; - // } - // if (outPorts[i].blocked()) { - // continue; - // } - // Update update; - // Tick entrance_tick; - // std::tie(update, entrance_tick) = updateQueues[i].front(); - // PacketPtr pkt = createUpdatePacket(update.dst, update.value); - // outPorts[i].sendPacket(pkt); - // DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " - // "%d.\n", __func__, update.src, update.dst, update.value); - // updateQueues[i].pop_front(); - // DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); - // if (updateQueues[i].size() > 0) { - // next_time_send += 1; - // } - // } - 
for (int i = 0; i < outPorts.size(); i++) { if (outPorts[i].blocked()) { - DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d blocked.\n", + __func__, outPorts[i].id()); continue; } - DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d available.\n", + __func__, outPorts[i].id()); if (updateQueues[outPorts[i].id()].empty()) { - DPRINTF(PushEngine, "%s: Respective queue for port %d is empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d is empty.\n", __func__, outPorts[i].id()); continue; } - DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d the queue size is %d.\n", __func__, update.to_string(), outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); + DPRINTF(PushEngine, "%s: Sent update: %s to port %d. 
" + "Respective queue size is %d.\n", __func__, + update.to_string(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); updateQueues[outPorts[i].id()].pop_front(); - DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } @@ -480,7 +486,9 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), - "Histogram of the latency of the edgeQueue.") + "Histogram of the latency of the edgeQueue."), + ADD_STAT(updateQueueLength, statistics::units::Count::get(), + "Histogram of the length of updateQueues.") { } @@ -493,6 +501,7 @@ PushEngine::PushStats::regStats() edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); + updateQueueLength.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4e0cdbc526..fbe527bcb6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -51,7 +51,7 @@ class PushEngine : public BaseMemoryEngine public: ReqPort(const std::string& name, PushEngine* owner, PortID id) : - RequestPort(name, owner), + RequestPort(name, owner), owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); @@ -132,7 +132,6 @@ class PushEngine : public BaseMemoryEngine uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; - // std::vector>> updateQueues; template PacketPtr createUpdatePacket(Addr addr, T value); bool enqueueUpdate(Update update); std::unordered_map portAddrMap; @@ -170,6 +169,7 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; + statistics::Histogram updateQueueLength; }; PushStats stats; From 
4b555f682145b9f7dbd306ac5ff7ce47a150dc03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 6 Oct 2022 15:35:54 -0700 Subject: [PATCH 183/247] Back indent. --- configs/accl/sega-simple.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index 54a90281bf..93267f0f24 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -31,18 +31,18 @@ from m5.objects import * def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): From fe68447f9d5b106c6802e2cd7e5e47718c0dd83c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 7 Oct 2022 10:27:22 -0700 Subject: [PATCH 184/247] Fixed HBM range issue. 
--- configs/accl/sega-hbm.py | 163 ++++++++++++++++++++++ src/accl/graph/sega/base_memory_engine.cc | 11 +- src/accl/graph/sega/coalesce_engine.cc | 27 ---- src/base/addr_range.hh | 44 +++--- src/mem/HBMCtrl.py | 2 + src/mem/hbm_ctrl.cc | 10 +- src/mem/hbm_ctrl.hh | 3 +- 7 files changed, 202 insertions(+), 58 deletions(-) create mode 100644 configs/accl/sega-hbm.py diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py new file mode 100644 index 0000000000..da7d79d7fe --- /dev/null +++ b/configs/accl/sega-hbm.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=128, + register_file_size=64 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64, + update_queue_size=16 + ) + + self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64()) + + self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return 
self.wl_engine.in_ports + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + def setReqPort(self, port): + self.push_engine.out_ports = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.dram.image_file = edge_image + +class SEGA(System): + def __init__(self, num_mpus, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + 2*num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("2GiB", cache_size) + gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) + gpt.set_vertex_pch_bit(8) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if 
__name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index d9864664b1..9f704f71e9 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -60,13 +60,10 @@ BaseMemoryEngine::init() { AddrRangeList memory_ranges = memPort.getAddrRanges(); - if (memory_ranges.size() == 2) { - peerMemoryRange = merge(memory_ranges.front(), memory_ranges.back()); - } else if (memory_ranges.size() == 1) { - peerMemoryRange = memory_ranges.front(); - } else { - panic("Received an unacceptable number of ranges from memory."); - } + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " "%s. 
The range is %s interleaved.\n", __func__, peerMemoryRange.to_string(), diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0a4a041176..f4cd6a950d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -127,15 +127,6 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - // bool found = false; - // Addr trimmed_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(addr)) { - // trimmed_addr = range.removeIntlvBits(addr); - // found = true; - // } - // } - // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } @@ -145,15 +136,6 @@ int CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - // bool found = false; - // Addr trimmed_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(addr)) { - // trimmed_addr = range.removeIntlvBits(addr); - // found = true; - // } - // } - // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); @@ -165,16 +147,7 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - // bool found = false; Addr trimmed_addr = index * sizeof(WorkListItem); - // Addr upgraded_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(trimmed_addr)) { - // upgraded_addr = range.addIntlvBits(trimmed_addr); - // found = true; - // } - // } - // assert(found); return peerMemoryRange.addIntlvBits(trimmed_addr); } diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index a4bf581224..339fdb6c55 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -735,33 +735,37 @@ class AddrRange } friend AddrRange - 
merge(const AddrRange& left, const AddrRange& right) + mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit) { assert(left.interleaved()); assert(right.interleaved()); assert(left.mergesWith(right)); - int bits_org = left.masks.size(); - int bits_new = bits_org - 1; - - int left_match = left.intlvMatch; - int right_match = right.intlvMatch; - assert(std::abs(left_match - right_match) == (1 << bits_new)); - - Addr last_mask = left.masks[left.masks.size() - 1]; - int xor_high_bit_org = 0; - int xor_high_bit_new = 0; - if (!isPowerOf2(last_mask)) { - xor_high_bit_org = ceilLog2(last_mask); - xor_high_bit_new = xor_high_bit_org - 2; + uint8_t old_left_match = left.intlvMatch; + uint8_t new_left_match = 0; + uint8_t old_right_match = right.intlvMatch; + uint8_t new_right_match = 0; + int new_bits = left.masks.size() - 1; + + // assumption: masks is sorted in ascending order + std::vector new_masks; + for (auto mask: left.masks) { + uint64_t lsb_mask = (mask ^ (mask - 1)) + 1; + if ((lsb_mask >> 1) != (1 << pch_bit)) { + new_masks.push_back(mask); + new_left_match |= ((old_left_match & 1) << new_bits); + new_left_match >>= 1; + new_right_match |= ((old_right_match & 1) << new_bits); + new_right_match >>= 1; + } + old_left_match >>= 1; + old_right_match >>= 1; } - int intlv_high_bit_org = - ceilLog2(last_mask ^ (1 << xor_high_bit_org)); - int intlv_high_bit_new = intlv_high_bit_org - 2; + panic_if(new_left_match != new_right_match, + "The two ranges can not be a pseudo channel pair " + "given the pseudochannel bit position of params.pch_bit."); - int match = std::min(left_match, right_match); - return AddrRange(left._start, left._end, intlv_high_bit_new, - xor_high_bit_new, bits_new, match); + return AddrRange(left._start, left._end, new_masks, new_left_match); } }; diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index 0c7c1ea919..f7355d4b67 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -42,6 +42,8 @@ class HBMCtrl(MemCtrl): # HBMCtrl 
has been tested with two HBM_2000_4H_1x64 interfaces dram_2 = Param.DRAMInterface("DRAM memory interface") + pch_bit = Param.Int("Position of PseudoChannel bit in addresses.") + # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 99618c4b5f..efd46bbd54 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,6 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), + pchBit(p.pch_bit), retryRdReqPC1(false), retryWrReqPC1(false), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, @@ -233,7 +234,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + if (bits(pkt->getAddr(), pchBit) == 0) { is_pc0 = true; } else { is_pc0 = false; @@ -492,8 +493,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index c9045f0ae7..f204b8346f 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,7 +72,8 @@ class HBMCtrl : public MemCtrl } private: - + // Position of the pseudochannel bit in addresses. + int pchBit; /** * Remember if we have to retry a request for second pseudo channel. 
*/ From d30ddb5df9c64082e10ff101b4064e41bbf41029 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 7 Oct 2022 11:49:25 -0700 Subject: [PATCH 185/247] Refactoring reading edges from memory --- src/accl/graph/sega/push_engine.cc | 41 +++++++++++++----------------- src/accl/graph/sega/push_engine.hh | 10 ++++++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5835b61fc6..7265cec1a4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -148,7 +148,7 @@ PushEngine::done() for (int i = 0; i < outPorts.size(); i++) { empty_update_queues &= updateQueues[outPorts[i].id()].empty(); } - return empty_update_queues && edgeQueue.empty() && + return empty_update_queues && metaEdgeQueue.empty() && (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } @@ -230,13 +230,13 @@ PushEngine::processNextMemoryReadEvent() nextMemoryReadEvent.sleep(); return; } + Addr aligned_addr, offset; + int num_edges; - if (edgeQueue.size() < (edgeQueueSize - onTheFlyMemReqs)) { - Addr aligned_addr, offset; - int num_edges; - - EdgeReadInfoGen &curr_info = edgePointerQueue.front(); - std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + EdgeReadInfoGen& curr_info = edgePointerQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + { DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. 
aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); @@ -246,8 +246,9 @@ PushEngine::processNextMemoryReadEvent() reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); - onTheFlyMemReqs++; + onTheFlyMemReqs += num_edges; + curr_info.iterate(); if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); stats.edgePointerQueueLatency.sample( @@ -290,19 +291,16 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque> edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; MetaEdge meta_edge( push_info.src, edge_dst, edge_weight, push_info.value); - edges.emplace_back(meta_edge, curTick()); + metaEdgeQueue.emplace_back(meta_edge, curTick()); } - assert(!edges.empty()); - edgeQueue.push_back(edges); - onTheFlyMemReqs--; + onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); delete pkt_data; delete pkt; @@ -318,17 +316,16 @@ PushEngine::processNextPropagateEvent() { int num_propagates = 0; while(true) { - std::deque>& edge_list = edgeQueue.front(); MetaEdge meta_edge; Tick entrance_tick; - std::tie(meta_edge, entrance_tick) = edge_list.front(); + std::tie(meta_edge, entrance_tick) = metaEdgeQueue.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); - edge_list.pop_front(); + metaEdgeQueue.pop_front(); if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", @@ -337,14 +334,10 @@ PushEngine::processNextPropagateEvent() stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { - 
edge_list.emplace_back(meta_edge, entrance_tick); - } - - if (edge_list.empty()) { - edgeQueue.pop_front(); + metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } - if (edgeQueue.empty()) { + if (metaEdgeQueue.empty()) { break; } @@ -355,7 +348,7 @@ PushEngine::processNextPropagateEvent() } assert(!nextPropagateEvent.scheduled()); - if (!edgeQueue.empty()) { + if (!metaEdgeQueue.empty()) { schedule(nextPropagateEvent, nextCycle()); } } @@ -486,7 +479,7 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), - "Histogram of the latency of the edgeQueue."), + "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues.") { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index fbe527bcb6..cc087aff11 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -93,11 +93,17 @@ class PushEngine : public BaseMemoryEngine } else { num_items = (_end - _start) / _step; } - _start = aligned_addr + _atom; return std::make_tuple(aligned_addr, offset, num_items); } + void iterate() + { + panic_if(done(), "Should not call iterate when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + _start = aligned_addr + _atom; + } + bool done() { return (_start >= _end); } Addr src() { return _src; } @@ -126,7 +132,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; int maxPropagatesPerCycle; - std::deque>> edgeQueue; + std::deque> metaEdgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); From 7a6ab86032f9480e0c8d733a3968aa34f8d0eea2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 7 Oct 2022 13:33:25 -0700 Subject: [PATCH 186/247] Added statistics to calculate 
number of propagates sent --- src/accl/graph/sega/push_engine.cc | 10 +++++++--- src/accl/graph/sega/push_engine.hh | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 7265cec1a4..4b3277d3e1 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -336,17 +336,18 @@ PushEngine::processNextPropagateEvent() } else { metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } + num_propagates++; if (metaEdgeQueue.empty()) { break; } - - num_propagates++; if (num_propagates >= maxPropagatesPerCycle) { break; } } + stats.numPropagates.sample(num_propagates); + assert(!nextPropagateEvent.scheduled()); if (!metaEdgeQueue.empty()) { schedule(nextPropagateEvent, nextCycle()); @@ -481,7 +482,9 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), - "Histogram of the length of updateQueues.") + "Histogram of the length of updateQueues."), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Histogram of number of propagates sent.") { } @@ -495,6 +498,7 @@ PushEngine::PushStats::regStats() edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); updateQueueLength.init(64); + numPropagates.init(push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index cc087aff11..c078391420 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -176,6 +176,7 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; statistics::Histogram updateQueueLength; + statistics::Histogram numPropagates; }; PushStats stats; From 0bd83b6cc1c661fa484ab5d0a527d0a3d1e93722 Mon Sep 17 00:00:00 2001 
From: Marjan Fariborz Date: Sat, 8 Oct 2022 16:25:41 -0700 Subject: [PATCH 187/247] Adding coalescing to pushEngine --- src/accl/graph/sega/push_engine.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4b3277d3e1..79e5344395 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -368,6 +368,7 @@ bool PushEngine::enqueueUpdate(Update update) { Addr dst_addr = update.dst; + bool fount_coalescing = false; bool found_locally = false; bool accepted = false; for (auto range : localAddrRange) { @@ -383,7 +384,26 @@ PushEngine::enqueueUpdate(Update update) "in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + for (auto itr = updateQueues[outPorts[i].id()].begin(); + itr != updateQueues[outPorts[i].id()].end(); + itr++){ + std::tuple curr_update = *itr; + if (std::get<0>(curr_update).dst == update.dst){ + uint32_t value = + std::min(std::get<0>(curr_update).value, update.value); + DPRINTF(PushEngine, "%s: found a coalescing opportunity " + "for destination %d new value: %d by comparing %d " + "and %d. 
\n", __func__, update.dst, value, + std::get<0>(curr_update).value, update.value); + fount_coalescing = true; + update.value = value; + updateQueues[outPorts[i].id()].erase(itr); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + break; + } + } + if ((fount_coalescing == false) && + (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue %d.\n", __func__, outPorts[i].id()); updateQueues[outPorts[i].id()].emplace_back(update, curTick()); @@ -398,6 +418,7 @@ PushEngine::enqueueUpdate(Update update) } } } + fount_coalescing = false; if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); From 9f052dcf27a64d21582f48f41eb032bb1fe48464 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 8 Oct 2022 19:49:58 -0700 Subject: [PATCH 188/247] Adding function to print final answer. --- configs/accl/sega-hbm.py | 18 +++-- configs/accl/sega-simple.py | 2 +- configs/accl/sega.py | 2 +- src/accl/graph/sega/CenteralController.py | 5 +- src/accl/graph/sega/centeral_controller.cc | 44 +++++++++++- src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/push_engine.cc | 80 ++++++++++++---------- src/accl/graph/sega/push_engine.hh | 9 ++- src/base/addr_range.hh | 10 +++ 9 files changed, 125 insertions(+), 48 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index da7d79d7fe..70aac6c2cb 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -61,8 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=16 + resp_queue_size=512, + update_queue_size=32 ) self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), @@ -136,6 +136,9 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): 
self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def print_answer(self): + self.ctrl.printAnswerToHostSimout() + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -143,14 +146,19 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument("--verify", type=bool, help="Print final answer") args = argparser.parse_args() + verify = False + if not args.verify is None: + verify = args.verify + return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value + args.graph, args.init_addr, args.init_value, verify if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) @@ -161,3 +169,5 @@ def get_inputs(): exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") + if verify: + system.print_answer() diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index 93267f0f24..7ec19c92ae 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16, + update_queue_size=32, ) self.vertex_mem_ctrl = SimpleMemory( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index fab414f2c5..c50c525297 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16 + update_queue_size=32 ) self.vertex_mem_ctrl = 
MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 0721ff977c..2ba53c231f 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -41,4 +41,7 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") - cxx_exports = [PyBindMethod("createInitialBFSUpdate")] + cxx_exports = [ + PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("printAnswerToHostSimout") + ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 68b88e9e77..7c89c1edea 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,9 @@ #include "accl/graph/sega/centeral_controller.hh" +#include + +#include "base/cprintf.hh" #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" @@ -62,7 +65,7 @@ CenteralController::initState() loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); - Addr maxVertexAddr = image.maxAddr(); + maxVertexAddr = image.maxAddr(); PortProxy proxy( [this](PacketPtr pkt) { @@ -97,6 +100,21 @@ CenteralController::startup() } } +PacketPtr +CenteralController::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 0) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + template PacketPtr CenteralController::createUpdatePacket(Addr addr, T value) { @@ -134,4 +152,28 @@ CenteralController::recvDoneSignal() } } +void +CenteralController::printAnswerToHostSimout() +{ + int num_items = system->cacheLineSize() / sizeof(WorkListItem); + 
WorkListItem items[num_items]; + for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) + { + PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + if (contains(range_list, addr)) { + mpu->recvFunctional(pkt); + } + } + pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + for (int i = 0; i < num_items; i++) { + std::string print = csprintf("WorklistItem[%lu][%d]: %s.", + addr, i, items[i].to_string()); + + std::cout << print << std::endl; + } + } +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 4a4e9c7cb1..d006851e3b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -53,6 +53,7 @@ class CenteralController : public ClockedObject std::vector mpuVector; std::unordered_map addrRangeListMap; + PacketPtr createReadPacket(Addr addr, unsigned int size); template PacketPtr createUpdatePacket(Addr addr, T value); public: @@ -64,6 +65,8 @@ class CenteralController : public ClockedObject void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void recvDoneSignal(); + + void printAnswerToHostSimout(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 79e5344395..d5fb002f82 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,7 +43,6 @@ PushEngine::PushEngine(const Params& params): numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), maxPropagatesPerCycle(params.max_propagates_per_cycle), - workload(params.workload), updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), @@ -152,10 +151,23 @@ PushEngine::done() (onTheFlyMemReqs == 
0) && edgePointerQueue.empty(); } +uint32_t +PushEngine::reduce(uint32_t update, uint32_t value) +{ + std::string workload = params().workload; + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} uint32_t PushEngine::propagate(uint32_t value, uint32_t weight) { + std::string workload = params().workload; uint32_t update; if (workload == "BFS") { update = value + 1; @@ -235,7 +247,7 @@ PushEngine::processNextMemoryReadEvent() EdgeReadInfoGen& curr_info = edgePointerQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " @@ -299,6 +311,8 @@ PushEngine::handleMemResp(PacketPtr pkt) push_info.src, edge_dst, edge_weight, push_info.value); metaEdgeQueue.emplace_back(meta_edge, curTick()); } + stats.numWastefulEdgesRead += + (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); @@ -330,7 +344,7 @@ PushEngine::processNextPropagateEvent() if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); - stats.numUpdates++; + stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { @@ -346,7 +360,7 @@ PushEngine::processNextPropagateEvent() } } - stats.numPropagates.sample(num_propagates); + stats.numPropagatesHist.sample(num_propagates); assert(!nextPropagateEvent.scheduled()); if (!metaEdgeQueue.empty()) { @@ -354,21 +368,11 @@ PushEngine::processNextPropagateEvent() } } -bool -contains(AddrRangeList range_list, Addr addr) -{ - bool found = false; - for (auto range: 
range_list) { - found |= range.contains(addr); - } - return found; -} - bool PushEngine::enqueueUpdate(Update update) { Addr dst_addr = update.dst; - bool fount_coalescing = false; + bool found_coalescing = false; bool found_locally = false; bool accepted = false; for (auto range : localAddrRange) { @@ -384,25 +388,21 @@ PushEngine::enqueueUpdate(Update update) "in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - for (auto itr = updateQueues[outPorts[i].id()].begin(); - itr != updateQueues[outPorts[i].id()].end(); - itr++){ - std::tuple curr_update = *itr; - if (std::get<0>(curr_update).dst == update.dst){ - uint32_t value = - std::min(std::get<0>(curr_update).value, update.value); + for (auto& entry: updateQueues[outPorts[i].id()]) { + Update& curr_update = std::get<0>(entry); + if (curr_update.dst == update.dst) { + uint32_t old_value = curr_update.value; + curr_update.value = reduce(old_value, update.value); DPRINTF(PushEngine, "%s: found a coalescing opportunity " - "for destination %d new value: %d by comparing %d " - "and %d. \n", __func__, update.dst, value, - std::get<0>(curr_update).value, update.value); - fount_coalescing = true; - update.value = value; - updateQueues[outPorts[i].id()].erase(itr); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - break; + "for destination %d with new value: %d by " + "coalescing %d and %d. 
\n", __func__, update.dst, + curr_update.value, old_value, update.value); + found_coalescing = true; + accepted = true; + stats.updateQueueCoalescions++; } } - if ((fount_coalescing == false) && + if ((found_coalescing == false) && (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue %d.\n", __func__, outPorts[i].id()); @@ -418,7 +418,6 @@ PushEngine::enqueueUpdate(Update update) } } } - fount_coalescing = false; if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); @@ -478,6 +477,7 @@ PushEngine::processNextUpdatePushEvent() if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } + stats.numUpdates++; } assert(!nextUpdatePushEvent.scheduled()); @@ -489,12 +489,18 @@ PushEngine::processNextUpdatePushEvent() PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), - ADD_STAT(numUpdates, statistics::units::Count::get(), - "Number of sent updates."), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Number of propagate operations done."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), ADD_STAT(numIdleCycles, statistics::units::Count::get(), "Number of cycles PushEngine has been idle."), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of updates sent to the network."), + ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(), + "Number of wasteful edges read from edge memory."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second."), @@ -504,7 +510,7 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), - 
ADD_STAT(numPropagates, statistics::units::Count::get(), + ADD_STAT(numPropagatesHist, statistics::units::Count::get(), "Histogram of number of propagates sent.") { } @@ -514,12 +520,12 @@ PushEngine::PushStats::regStats() { using namespace statistics; - TEPS = numUpdates / simSeconds; + TEPS = numPropagates / simSeconds; edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); updateQueueLength.init(64); - numPropagates.init(push.params().max_propagates_per_cycle); + numPropagatesHist.init(push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c078391420..6163ba5c27 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -134,7 +134,7 @@ class PushEngine : public BaseMemoryEngine int maxPropagatesPerCycle; std::deque> metaEdgeQueue; - std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; @@ -167,16 +167,19 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; - statistics::Scalar numUpdates; + statistics::Scalar numPropagates; statistics::Scalar numNetBlocks; statistics::Scalar numIdleCycles; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; statistics::Formula TEPS; statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; statistics::Histogram updateQueueLength; - statistics::Histogram numPropagates; + statistics::Histogram numPropagatesHist; }; PushStats stats; diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 339fdb6c55..3c5c150b29 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -852,6 +852,16 @@ RangeSize(Addr start, Addr size) return AddrRange(start, start + size); } +inline bool +contains(AddrRangeList range_list, Addr addr) +{ + bool ret = false; + for (auto range: range_list) { + 
ret |= range.contains(addr); + } + return ret; +} + } // namespace gem5 #endif // __BASE_ADDR_RANGE_HH__ From cc19d17fc22d22377f2d3d56c43fe981fb66f70f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 9 Oct 2022 17:15:04 -0700 Subject: [PATCH 189/247] Typos. --- configs/accl/real-graph-gen.py | 74 +++++++++++++++++++ configs/accl/sega-hbm.py | 14 ++-- .../accl/{graph-gen.py => synth-graph-gen.py} | 0 src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 12 +-- src/accl/graph/sega/wl_engine.hh | 2 +- 6 files changed, 89 insertions(+), 15 deletions(-) create mode 100644 configs/accl/real-graph-gen.py rename configs/accl/{graph-gen.py => synth-graph-gen.py} (100%) diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py new file mode 100644 index 0000000000..db44c63a9a --- /dev/null +++ b/configs/accl/real-graph-gen.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import argparse +import subprocess + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("path", type=str, help="Path to the graph file.") + argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + + args = argparser.parse_args() + return args.path, args.num_gpts + +if __name__ == "__main__": + graph_path, num_gpts = get_inputs() + + graph_reader = os.environ.get("GRAPH_READER") + + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + + if not os.path.exists(graph_path): + raise ValueError(f"{graph_path} does not exist.") + + graph_dir = os.path.dirname(graph_path) + if not "binaries" in os.listdir(graph_dir): + print(f"binaries directory not found in {graph_dir}") + os.mkdir(f"{graph_dir}/binaries") + print(f"Created {graph_dir}/binaries") + + if not f"gpts_{num_gpts}" in os.listdir(f"{graph_dir}/binaries"): + print(f"gpts_{num_gpts} not found in {graph_dir}/binaries") + os.mkdir(f"{graph_dir}/binaries/gpts_{num_gpts}") + print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] + if not all([binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): + 
os.remove(delete.path) + print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}", + "false", + f"{num_gpts}", + "32", + f"{graph_dir}/binaries/gpts_{num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_dir}/binaries/gpts_{num_gpts}") diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 70aac6c2cb..cdc752f2bd 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -42,7 +42,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): xorHighBit=0, intlvBits=intlv_bits, intlvMatch=i)) - return ret + return ret, intlv_low_bit + intlv_bits - 1 class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): @@ -112,17 +112,17 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - 2*num_mpus, - 32 - ) + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), + 2*num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): gpt = GPT("2GiB", cache_size) gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) - gpt.set_vertex_pch_bit(8) + gpt.set_vertex_pch_bit(pch_bit) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) # Creating the interconnect among mpus diff --git a/configs/accl/graph-gen.py b/configs/accl/synth-graph-gen.py similarity index 100% rename from configs/accl/graph-gen.py rename to configs/accl/synth-graph-gen.py diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7c89c1edea..82e63d512e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -168,7 +168,7 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - 
std::string print = csprintf("WorklistItem[%lu][%d]: %s.", + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, items[i].to_string()); std::cout << print << std::endl; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 116cdf3f77..eb2006a3df 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -76,16 +76,16 @@ WLEngine::registerMPU(MPU* mpu) owner = mpu; } -AddrRangeList +AddrRangeList WLEngine::getAddrRanges() -{ - return owner->getAddrRanges(); +{ + return owner->getAddrRanges(); } -void +void WLEngine::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); +{ + owner->recvFunctional(pkt); } AddrRangeList diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 5f08678d26..7578044cbf 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -54,7 +54,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner, PortID id): - ResponsePort(name, owner), + ResponsePort(name, owner), owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; From 76407f72953961561a153510f3dc81723f4847e1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 11 Oct 2022 15:07:29 -0700 Subject: [PATCH 190/247] Adding functions to move value to and from float. 
--- src/accl/graph/base/data_structs.hh | 24 +++++++++++++++++++++++- src/accl/graph/sega/push_engine.cc | 13 ++++++------- src/accl/graph/sega/push_engine.hh | 11 ++++------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 34c8eb98ce..3753e10d62 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -33,6 +33,8 @@ #include "base/intmath.hh" #include +#include +#include namespace gem5 { @@ -96,7 +98,7 @@ struct MetaEdge { uint32_t weight; uint32_t value; - MetaEdge(): src(0), dst(0), weight(0), value(0) + MetaEdge(): src(0), dst(0), weight(0), value(0) {} MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): src(src), dst(dst), weight(weight), value(value) @@ -176,6 +178,26 @@ class UniqueFIFO } }; +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d5fb002f82..cd795eaf00 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -220,10 +220,9 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - edgePointerQueue.emplace_back( - start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, - (uint32_t) wl.prop, curTick()); + EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, (uint32_t) wl.prop); + edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; if (workLeft() && vertexSpace() && 
(!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -245,7 +244,8 @@ PushEngine::processNextMemoryReadEvent() Addr aligned_addr, offset; int num_edges; - EdgeReadInfoGen& curr_info = edgePointerQueue.front(); + EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); + Tick entrance_tick = std::get<1>(edgePointerQueue.front()); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { @@ -264,8 +264,7 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); stats.edgePointerQueueLatency.sample( - (curTick() - curr_info.entrance()) * - 1e9 / getClockFrequency()); + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 6163ba5c27..acf012b24d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -73,12 +73,11 @@ class PushEngine : public BaseMemoryEngine Addr _src; uint32_t _value; - Tick _entrance; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value, Tick entrance): - _start(start), _end(end), _step(step), _atom(atom), - _src(src), _value(value), _entrance(entrance) + size_t atom, Addr src, uint32_t value): + _start(start), _end(end), _step(step), + _atom(atom), _src(src), _value(value) {} std::tuple nextReadPacketInfo() @@ -108,8 +107,6 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } - - Tick entrance() { return _entrance; } }; struct PushInfo { Addr src; @@ -126,7 +123,7 @@ class PushEngine : public BaseMemoryEngine int numPendingPulls; int 
edgePointerQueueSize; - std::deque edgePointerQueue; + std::deque> edgePointerQueue; std::unordered_map reqInfoMap; int onTheFlyMemReqs; From 6413163e6f818ddc442e58c9302004c34bff1933 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 11 Oct 2022 15:54:40 -0700 Subject: [PATCH 191/247] Adding sssp and pr. --- src/accl/graph/sega/CoalesceEngine.py | 2 ++ src/accl/graph/sega/PushEngine.py | 3 ++ src/accl/graph/sega/coalesce_engine.cc | 29 ++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 40 ++++++++++++++++++++++---- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 8 +++++- 7 files changed, 63 insertions(+), 21 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index f6e997f1e3..eeba279b7a 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -44,3 +44,5 @@ class CoalesceEngine(BaseMemoryEngine): "requestor in each cycle. 
Used to limit b/w.") workload = Param.String("BFS", "Name of the workload") + + thereshold = Param.Float('0.0001', "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 5e0d2b3212..52dc0e2506 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -51,3 +51,6 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") + + alpha = Param.Float(0.8, "This parameter is specific to pagerank") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f4cd6a950d..91072a1da8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,7 +48,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), workload(params.workload), + _workCount(0), numPullsReceived(0), + workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -110,16 +111,20 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } -uint32_t -CoalesceEngine::reduce(uint32_t update, uint32_t value) +bool +CoalesceEngine::applyCondition(uint32_t update, uint32_t value) { - uint32_t new_value; if(workload == "BFS"){ - new_value = std::min(update, value); + return update != value; + } else if (workload == "SSSP"){ + return update < value; + } else if (workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + return params().thereshold <= abs(float_update - float_value); } else{ - panic("Workload not implemented\n"); + panic("The workload is not recognize"); } - return new_value; } // addr should be aligned to peerMemoryAtomSize @@ -639,7 +644,8 @@ 
CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); - if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { + if (applyCondition( + wl.tempProp, cacheBlocks[block_index].items[wl_offset].tempProp)) { cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].needsApply |= true; // NOTE: We don't set needsWB and rely on processNextApplyEvent to @@ -747,12 +753,7 @@ CoalesceEngine::processNextApplyEvent() assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - // NOTE: It might be the case that for workloads other than BFS, - // the reduce function here should be different to the reduce - // function defined in WLEngine. Think about the case of PR in - // detail. - uint32_t new_prop = reduce( - cacheBlocks[block_index].items[index].tempProp, current_prop); + uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b1f5b1fea1..a087f37b4d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -131,6 +131,7 @@ class CoalesceEngine : public BaseMemoryEngine std::string workload; uint32_t reduce(uint32_t update, uint32_t value); + bool applyCondition(uint32_t update, uint32_t value); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cd795eaf00..c9efa03f08 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -158,6 +158,10 @@ PushEngine::reduce(uint32_t update, uint32_t value) uint32_t new_value; 
if(workload == "BFS"){ new_value = std::min(update, value); + } else if(workload == "PR"){ + new_value = update + value; + } else if(workload == "SSSP"){ + new_value = std::min(update, value); } else{ panic("Workload not implemented\n"); } @@ -165,19 +169,42 @@ PushEngine::reduce(uint32_t update, uint32_t value) } uint32_t -PushEngine::propagate(uint32_t value, uint32_t weight) +PushEngine::propagate(uint32_t delta, uint32_t weight) { std::string workload = params().workload; uint32_t update; if (workload == "BFS") { - update = value + 1; - } - else{ + update = delta + 1; + } else if (workload == "SSSP") { + update = delta + weight; + } else if (workload == "PR") { + float float_form = writeToFloat(delta); + float float_update = float_form * weight * params().alpha; + update = readFromFloat(float_update); + } else{ panic("The workload %s is not supported", workload); } return update; } +uint32_t +PushEngine::calculateValue(WorkListItem wl) +{ + std::string workload = params().workload; + uint32_t delta; + if (workload == "PR") { + float property = writeToFloat(wl.prop) / wl.degree; + delta = readFromFloat(property); + } else if (workload == "BFS") { + delta = wl.prop; + } else if (workload == "SSSP") { + delta = wl.prop; + } else { + panic("Workload not supported."); + } + return delta; +} + void PushEngine::start() { @@ -220,9 +247,11 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = calculateValue(wl); EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, (uint32_t) wl.prop); + peerMemoryAtomSize, addr, value); edgePointerQueue.emplace_back(info_gen, curTick()); + numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -256,7 +285,6 @@ PushEngine::processNextMemoryReadEvent() PacketPtr pkt = 
createReadPacket(aligned_addr, peerMemoryAtomSize); PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; - memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index acf012b24d..c03e78851c 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,6 +133,7 @@ class PushEngine : public BaseMemoryEngine uint32_t reduce(uint32_t update, uint32_t value); uint32_t propagate(uint32_t value, uint32_t weight); + uint32_t calculateValue(WorkListItem wl); int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index eb2006a3df..f684650f23 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -152,8 +152,14 @@ WLEngine::reduce(uint32_t update, uint32_t value) uint32_t new_value; if(workload == "BFS"){ new_value = std::min(update, value); + } else if(workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + new_value = readFromFloat(float_update + float_value); + } else if(workload == "SSSP"){ + new_value = std::min(update, value); } else{ - panic("Workload not implemented\n"); + panic("Workload not implemented."); } return new_value; } From bdb42750389d6e308a726f2d100bb5757895e034 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 11 Oct 2022 21:23:27 -0700 Subject: [PATCH 192/247] making workload appropriate inits --- src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 17 +++++--- src/accl/graph/sega/centeral_controller.hh | 1 + src/accl/graph/sega/coalesce_engine.cc | 51 +++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 2 +- 5 files changed, 50 insertions(+), 22 deletions(-) diff --git a/src/accl/graph/sega/CenteralController.py 
b/src/accl/graph/sega/CenteralController.py index 2ba53c231f..ebc8281641 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,5 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("createInitialPRUpdate"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 82e63d512e..9231f96379 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -71,11 +71,8 @@ CenteralController::initState() [this](PacketPtr pkt) { for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; - for (auto range: range_list) { - if (range.contains(pkt->getAddr())) { - mpu->recvFunctional(pkt); - break; - } + if (contains(range_list, pkt->getAddr())) { + mpu->recvFunctional(pkt); } } }, system->cacheLineSize()); @@ -139,6 +136,16 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) initialUpdates.push_back(update); } +void +CenteralController::createInitialPRUpdate() +{ + for (auto mpu: mpuVector) { + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } + } +} + void CenteralController::recvDoneSignal() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index d006851e3b..5b0f5d6816 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -64,6 +64,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); + void createInitialPRUpdate(); void recvDoneSignal(); void printAnswerToHostSimout(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 91072a1da8..92ad346b30 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ 
b/src/accl/graph/sega/coalesce_engine.cc @@ -75,6 +75,40 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } +void +CoalesceEngine::algoInit(PacketPtr pkt) +{ + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + if(workload == "PR") { + //TODO: Add Alpha + int bit_index_base = getBitIndexBase(pkt->getAddr()); + for (int i = 0; i < numElementsPerLine; i++) { + items[i].tempProp = readFromFloat(1 - 0.2); + items[i].prop = readFromFloat(1 - 0.2); + needsPush[bit_index_base + i] = 1; + activeBits.push_back(bit_index_base + i); + } + } + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); +} + +bool +CoalesceEngine::applyCondition(uint32_t update, uint32_t value) +{ + if(workload == "BFS"){ + return update != value; + } else if (workload == "SSSP"){ + return update < value; + } else if (workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + return params().thereshold <= abs(float_update - float_value); + } else{ + panic("The workload is not recognize"); + } +} + void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -100,6 +134,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { + algoInit(pkt); memPort.sendFunctional(pkt); } } @@ -111,22 +146,6 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } -bool -CoalesceEngine::applyCondition(uint32_t update, uint32_t value) -{ - if(workload == "BFS"){ - return update != value; - } else if (workload == "SSSP"){ - return update < value; - } else if (workload == "PR"){ - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - return params().thereshold <= abs(float_update - float_value); - } else{ - panic("The workload is not recognize"); - } -} - // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) diff --git a/src/accl/graph/sega/coalesce_engine.hh 
b/src/accl/graph/sega/coalesce_engine.hh index a087f37b4d..49ee441ed3 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -130,7 +130,7 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map pendingVertexPullReads; std::string workload; - uint32_t reduce(uint32_t update, uint32_t value); + void algoInit(PacketPtr pkt); bool applyCondition(uint32_t update, uint32_t value); MemoryEvent nextMemoryEvent; From 5fa0c4c2376706e694afa3babbe2353baafd7440 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 12 Oct 2022 14:41:19 -0700 Subject: [PATCH 193/247] wip for implementing prewB and prePush apply functions. --- src/accl/graph/sega/CoalesceEngine.py | 7 ++- src/accl/graph/sega/WLEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 61 +++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 4 ++ src/accl/graph/sega/mpu.hh | 2 + src/accl/graph/sega/push_engine.hh | 2 + 6 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index eeba279b7a..a50a814e89 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -43,6 +43,11 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. 
Used to limit b/w.") + post_apply_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + workload = Param.String("BFS", "Name of the workload") - thereshold = Param.Float('0.0001', "Score threshold for Pagerank") + threshold = Param.Float(0.0001, "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 91325ab53f..7fe392cc9e 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -45,4 +45,4 @@ class WLEngine(BaseReduceEngine): "many updates as this queueu has " "entries at the same time.") - workload = Param.String('BFS',"Name of the workload") \ No newline at end of file + workload = Param.String("BFS","Name of the workload") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 92ad346b30..4e1fe79899 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,7 +48,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), + _workCount(0), numPullsReceived(0), + postApplyWBQueueSize(params.post_apply_wb_queue_size), workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -67,6 +68,16 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } needsPush.reset(); + + // TODO: Get rid of these booleans. 
+ // applyBeforeWB = true; + // if (workload == "PR") { + // applyBeforeWB = false; + // } + // applyBeforePush = false; + // if (workload == "PR") { + // applyBeforePush = true; + // } } void @@ -84,7 +95,7 @@ CoalesceEngine::algoInit(PacketPtr pkt) //TODO: Add Alpha int bit_index_base = getBitIndexBase(pkt->getAddr()); for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(1 - 0.2); + items[i].tempProp = readFromFloat(0); items[i].prop = readFromFloat(1 - 0.2); needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); @@ -96,15 +107,15 @@ CoalesceEngine::algoInit(PacketPtr pkt) bool CoalesceEngine::applyCondition(uint32_t update, uint32_t value) { - if(workload == "BFS"){ + if(workload == "BFS") { return update != value; - } else if (workload == "SSSP"){ + } else if (workload == "SSSP") { return update < value; - } else if (workload == "PR"){ + } else if (workload == "PR") { float float_value = writeToFloat(value); float float_update = writeToFloat(update); - return params().thereshold <= abs(float_update - float_value); - } else{ + return params().threshold <= abs(float_update - float_value); + } else { panic("The workload is not recognize"); } } @@ -663,14 +674,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); - if (applyCondition( - wl.tempProp, cacheBlocks[block_index].items[wl_offset].tempProp)) { - cacheBlocks[block_index].items[wl_offset] = wl; - cacheBlocks[block_index].needsApply |= true; - // NOTE: We don't set needsWB and rely on processNextApplyEvent to - // set that bit. 
+ if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].needsWB |= true; stats.numVertexWrites++; } + if (applyCondition(wl.tempProp, + cacheBlocks[block_index].items[wl_offset].prop)) { + cacheBlocks[block_index].needsApply |= true; + } + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -773,10 +785,13 @@ CoalesceEngine::processNextApplyEvent() for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; - if (new_prop != current_prop) { - cacheBlocks[block_index].items[index].tempProp = new_prop; - cacheBlocks[block_index].items[index].prop = new_prop; - + if (applyCondition(new_prop, current_prop)) { + if (applyBeforeWB) { + cacheBlocks[block_index].items[index].tempProp = new_prop; + cacheBlocks[block_index].items[index].prop = new_prop; + } + // TODO: Implement this function + // bool do_push = preWBApply(cacheBlocks[block_index].items[index]); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); @@ -1046,6 +1061,18 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) needsPush[slice_base_index + wl_offset] = 0; _workCount--; + + // TODO: Implement a function like this. + // uint32_t delta, bool do_wb = prePushApply(cacheBlocks[block_index].items[wl_offset]); + // TODO: After implementing the above function get rid of this bool + // if (applyBeforePush) { + // cacheBlocks[block_index].items[wl_offset].prop = + // cacheBlocks[block_index].items[wl_offset].tempProp; + // } + // TODO: Implement recvVertexPush2 in PushEngine. 
+ // owner->recvVertexPush2(vertex_addr, delta, + // cacheBlocks[block_index].items[wl_offset].edgeIndex, + // cacheBlocks[block_index].items[wl_offset].degree); owner->recvVertexPush( vertex_addr, cacheBlocks[block_index].items[wl_offset]); stats.verticesPushed++; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 49ee441ed3..c9564ac187 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -114,11 +114,15 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; + bool applyBeforeWB; + bool applyBeforePush; int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; + int postApplyWBQueueSize; + std::deque postApplyWBQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 229bd28950..9dcb9de5d7 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,6 +75,8 @@ class MPU : public SimObject bool running() { return pushEngine->running(); } void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); + void recvVertexPush2(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c03e78851c..ec0dd09e43 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -199,6 +199,8 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); + void recvVertexPush2(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); void recvReqRetry(); From 2e1719a6537238b64337472dd0b5b741b07bc0c3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 14 Oct 2022 16:24:09 -0700 Subject: [PATCH 194/247] 
Adding GraphWorkload class. --- configs/accl/sega-hbm.py | 7 +- src/accl/graph/base/SConscript | 1 + src/accl/graph/base/data_structs.hh | 3 +- src/accl/graph/base/graph_workload.cc | 66 ++++++++++++ src/accl/graph/base/graph_workload.hh | 74 +++++++++++++ src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 10 ++ src/accl/graph/sega/centeral_controller.hh | 4 + src/accl/graph/sega/coalesce_engine.cc | 76 +++++++------- src/accl/graph/sega/coalesce_engine.hh | 9 +- src/accl/graph/sega/mpu.cc | 8 ++ src/accl/graph/sega/mpu.hh | 1 + src/accl/graph/sega/push_engine.cc | 115 +++++++++++---------- src/accl/graph/sega/push_engine.hh | 5 +- src/accl/graph/sega/wl_engine.cc | 39 ++++--- src/accl/graph/sega/wl_engine.hh | 5 +- 16 files changed, 302 insertions(+), 122 deletions(-) create mode 100644 src/accl/graph/base/graph_workload.cc create mode 100644 src/accl/graph/base/graph_workload.hh diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index cdc752f2bd..50fd5f3069 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -56,7 +56,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): cache_size=cache_size, num_mshr_entry=64, num_tgts_per_mshr=64, - max_resp_per_cycle=8 + max_resp_per_cycle=8, + post_apply_wb_queue_size=64 ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -135,6 +136,9 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): self.ctrl.createInitialBFSUpdate(init_addr, init_value) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) def print_answer(self): self.ctrl.printAnswerToHostSimout() @@ -166,6 +170,7 @@ def get_inputs(): m5.instantiate() system.create_initial_bfs_update(init_addr, init_value) + system.create_bfs_workload(init_addr, init_value) exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because 
{exit_event.getCause()}") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 8b741abfc8..35111c34d2 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -30,3 +30,4 @@ Import("*") SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 3753e10d62..2d81375b63 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,9 +32,10 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include +#include #include #include +#include namespace gem5 { diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc new file mode 100644 index 0000000000..3d0d45b1de --- /dev/null +++ b/src/accl/graph/base/graph_workload.cc @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/graph_workload.hh" + +namespace gem5 +{ + +uint32_t +BFSWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +BFSWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + 1; +} + +bool +BFSWorkload::applyCondition(WorkListItem wl) +{ + return wl.tempProp < wl.prop; +} + +bool +BFSWorkload::preWBApply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.degree > 0; +} + +std::tuple +BFSWorkload::prePushApply(WorkListItem& wl) +{ + uint32_t value = wl.prop; + return std::make_tuple(value, false); +} + +} // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh new file mode 100644 index 0000000000..304b434a3d --- /dev/null +++ b/src/accl/graph/base/graph_workload.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" + + +namespace gem5 +{ + +class GraphWorkload +{ + public: + GraphWorkload() {} + ~GraphWorkload() {} + virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; + virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; + virtual bool applyCondition(WorkListItem wl) = 0; + virtual bool preWBApply(WorkListItem& wl) = 0; + virtual std::tuple prePushApply(WorkListItem& wl) = 0; +}; + +class BFSWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + public: + BFSWorkload(uint64_t init_addr, uint32_t init_value): + GraphWorkload(), + initAddr(init_addr), initValue(init_value) + {} + + ~BFSWorkload() {} + + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual bool applyCondition(WorkListItem wl); + virtual bool preWBApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index ebc8281641..17badf9ec4 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,7 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("createBFSWorkload"), PyBindMethod("createInitialPRUpdate"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 9231f96379..2074f69f08 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -83,6 +83,10 @@ CenteralController::initState() void CenteralController::startup() { + for (auto mpu: mpuVector) { + mpu->recvWorkload(workload); + } + 
while(!initialUpdates.empty()) { PacketPtr front = initialUpdates.front(); for (auto mpu: mpuVector) { @@ -136,6 +140,12 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) initialUpdates.push_back(update); } +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + void CenteralController::createInitialPRUpdate() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 5b0f5d6816..1f1df00b4b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -32,6 +32,7 @@ #include #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "debug/FinalAnswer.hh" @@ -47,6 +48,8 @@ class CenteralController : public ClockedObject private: System* system; + GraphWorkload* workload; + Addr maxVertexAddr; std::deque initialUpdates; @@ -64,6 +67,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); + void createBFSWorkload(Addr init_addr, uint32_t init_value); void createInitialPRUpdate(); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4e1fe79899..20bfaf8481 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -68,16 +68,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } needsPush.reset(); - - // TODO: Get rid of these booleans. 
- // applyBeforeWB = true; - // if (workload == "PR") { - // applyBeforeWB = false; - // } - // applyBeforePush = false; - // if (workload == "PR") { - // applyBeforePush = true; - // } } void @@ -90,9 +80,10 @@ void CoalesceEngine::algoInit(PacketPtr pkt) { WorkListItem items[numElementsPerLine]; - pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + if(workload == "PR") { //TODO: Add Alpha + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); int bit_index_base = getBitIndexBase(pkt->getAddr()); for (int i = 0; i < numElementsPerLine; i++) { items[i].tempProp = readFromFloat(0); @@ -100,25 +91,39 @@ CoalesceEngine::algoInit(PacketPtr pkt) needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); } + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); } - pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + } -bool -CoalesceEngine::applyCondition(uint32_t update, uint32_t value) -{ - if(workload == "BFS") { - return update != value; - } else if (workload == "SSSP") { - return update < value; - } else if (workload == "PR") { - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - return params().threshold <= abs(float_update - float_value); - } else { - panic("The workload is not recognize"); - } -} +// bool +// CoalesceEngine::applyCondition(WorkListItem wl) +// { +// if (workload == "BFS") { +// return wl.tempProp != wl.prop; +// } else if (workload == "SSSP") { +// return wl.tempProp < wl.prop; +// } else if (workload == "PR") { +// float float_temp = writeToFloat(wl.tempProp); +// float float_prop = writeToFloat(wl.prop); +// return params().threshold <= abs(float_prop - float_temp); +// } else { +// panic("The workload is not recognized."); +// } +// } + +// bool +// CoalesceEngine::preWBApply(WorkListItem& wl) +// { +// if (workload == "BFS") { +// uint32_t new_prop = std::min(wl.tempProp, wl.prop); +// wl.tempProp = new_prop; +// wl.prop = new_prop; +// return 
wl.degree > 0; +// } else { +// panic("The workload is not recognized."); +// } +// } void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -678,11 +683,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].needsWB |= true; stats.numVertexWrites++; } - if (applyCondition(wl.tempProp, - cacheBlocks[block_index].items[wl_offset].prop)) { + cacheBlocks[block_index].items[wl_offset] = wl; + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { cacheBlocks[block_index].needsApply |= true; } - cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -783,19 +787,13 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { - uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; - if (applyCondition(new_prop, current_prop)) { - if (applyBeforeWB) { - cacheBlocks[block_index].items[index].tempProp = new_prop; - cacheBlocks[block_index].items[index].prop = new_prop; - } + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[index])) { // TODO: Implement this function - // bool do_push = preWBApply(cacheBlocks[block_index].items[index]); + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (cacheBlocks[block_index].items[index].degree > 0) { + if (do_push) { if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c9564ac187..3492cab9dc 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -31,8 +31,9 
@@ #include -#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -134,8 +135,11 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map pendingVertexPullReads; std::string workload; + GraphWorkload* graphWorkload; + void algoInit(PacketPtr pkt); - bool applyCondition(uint32_t update, uint32_t value); + bool applyCondition(WorkListItem wl); + bool preWBApply(WorkListItem& wl); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -203,6 +207,7 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt); bool recvWLRead(Addr addr); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 44054d1efb..70f1e05f32 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -71,6 +71,14 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 9dcb9de5d7..8f6101c325 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -69,6 +69,7 @@ class MPU : public SimObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return 
coalesceEngine->recvVertexPull(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c9efa03f08..a661a755b7 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -151,59 +151,59 @@ PushEngine::done() (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } -uint32_t -PushEngine::reduce(uint32_t update, uint32_t value) -{ - std::string workload = params().workload; - uint32_t new_value; - if(workload == "BFS"){ - new_value = std::min(update, value); - } else if(workload == "PR"){ - new_value = update + value; - } else if(workload == "SSSP"){ - new_value = std::min(update, value); - } else{ - panic("Workload not implemented\n"); - } - return new_value; -} - -uint32_t -PushEngine::propagate(uint32_t delta, uint32_t weight) -{ - std::string workload = params().workload; - uint32_t update; - if (workload == "BFS") { - update = delta + 1; - } else if (workload == "SSSP") { - update = delta + weight; - } else if (workload == "PR") { - float float_form = writeToFloat(delta); - float float_update = float_form * weight * params().alpha; - update = readFromFloat(float_update); - } else{ - panic("The workload %s is not supported", workload); - } - return update; -} - -uint32_t -PushEngine::calculateValue(WorkListItem wl) -{ - std::string workload = params().workload; - uint32_t delta; - if (workload == "PR") { - float property = writeToFloat(wl.prop) / wl.degree; - delta = readFromFloat(property); - } else if (workload == "BFS") { - delta = wl.prop; - } else if (workload == "SSSP") { - delta = wl.prop; - } else { - panic("Workload not supported."); - } - return delta; -} +// uint32_t +// PushEngine::reduce(uint32_t update, uint32_t value) +// { +// std::string workload = params().workload; +// uint32_t new_value; +// if(workload == "BFS"){ +// new_value = std::min(update, value); +// } else if(workload == "PR"){ +// new_value = update + value; +// } else if(workload == "SSSP"){ +// new_value = 
std::min(update, value); +// } else{ +// panic("Workload not implemented\n"); +// } +// return new_value; +// } + +// uint32_t +// PushEngine::propagate(uint32_t delta, uint32_t weight) +// { +// std::string workload = params().workload; +// uint32_t update; +// if (workload == "BFS") { +// update = delta + 1; +// } else if (workload == "SSSP") { +// update = delta + weight; +// } else if (workload == "PR") { +// float float_form = writeToFloat(delta); +// float float_update = float_form * weight * params().alpha; +// update = readFromFloat(float_update); +// } else{ +// panic("The workload %s is not supported", workload); +// } +// return update; +// } + +// uint32_t +// PushEngine::calculateValue(WorkListItem wl) +// { +// std::string workload = params().workload; +// uint32_t delta; +// if (workload == "PR") { +// float property = writeToFloat(wl.prop) / wl.degree; +// delta = readFromFloat(property); +// } else if (workload == "BFS") { +// delta = wl.prop; +// } else if (workload == "SSSP") { +// delta = wl.prop; +// } else { +// panic("Workload not supported."); +// } +// return delta; +// } void PushEngine::start() @@ -247,9 +247,9 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = calculateValue(wl); + // uint32_t value = calculateValue(wl); EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, value); + peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; @@ -364,7 +364,8 @@ PushEngine::processNextPropagateEvent() DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); - uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); + uint32_t update_value = + graphWorkload->propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); 
metaEdgeQueue.pop_front(); @@ -419,7 +420,7 @@ PushEngine::enqueueUpdate(Update update) Update& curr_update = std::get<0>(entry); if (curr_update.dst == update.dst) { uint32_t old_value = curr_update.value; - curr_update.value = reduce(old_value, update.value); + curr_update.value = graphWorkload->reduce(old_value, update.value); DPRINTF(PushEngine, "%s: found a coalescing opportunity " "for destination %d with new value: %d by " "coalescing %d and %d. \n", __func__, update.dst, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ec0dd09e43..47db96d818 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,8 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -115,6 +116,7 @@ class PushEngine : public BaseMemoryEngine int numElements; }; MPU* owner; + GraphWorkload* graphWorkload; bool _running; Tick lastIdleEntranceTick; @@ -194,6 +196,7 @@ class PushEngine : public BaseMemoryEngine virtual void init() override; void registerMPU(MPU* mpu); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void start(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f684650f23..86acd40b69 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -41,7 +41,6 @@ WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), - workload(params.workload), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ 
processNextReduceEvent(); }, name()), stats(*this) @@ -146,23 +145,23 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } -uint32_t -WLEngine::reduce(uint32_t update, uint32_t value) -{ - uint32_t new_value; - if(workload == "BFS"){ - new_value = std::min(update, value); - } else if(workload == "PR"){ - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - new_value = readFromFloat(float_update + float_value); - } else if(workload == "SSSP"){ - new_value = std::min(update, value); - } else{ - panic("Workload not implemented."); - } - return new_value; -} +// uint32_t +// WLEngine::reduce(uint32_t update, uint32_t value) +// { +// uint32_t new_value; +// if(workload == "BFS"){ +// new_value = std::min(update, value); +// } else if(workload == "PR"){ +// float float_value = writeToFloat(value); +// float float_update = writeToFloat(update); +// new_value = readFromFloat(float_update + float_value); +// } else if(workload == "SSSP"){ +// new_value = std::min(update, value); +// } else{ +// panic("Workload not implemented."); +// } +// return new_value; +// } bool WLEngine::handleIncomingUpdate(PacketPtr pkt) @@ -251,7 +250,7 @@ WLEngine::processNextReadEvent() "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = - reduce(update_value, registerFile[update_addr]); + graphWorkload->reduce(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. 
registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -310,7 +309,7 @@ WLEngine::processNextReduceEvent() addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = - reduce(update_value, workListFile[addr].tempProp); + graphWorkload->reduce(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 7578044cbf..0d0e532269 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/graph_workload.hh" #include "accl/graph/base/data_structs.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" @@ -70,7 +71,8 @@ class WLEngine : public BaseReduceEngine }; MPU* owner; - + GraphWorkload* graphWorkload; + std::vector inPorts; int updateQueueSize; @@ -118,6 +120,7 @@ class WLEngine : public BaseReduceEngine void registerMPU(MPU* mpu); AddrRangeList getAddrRanges(); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } void recvFunctional(PacketPtr pkt); bool handleIncomingUpdate(PacketPtr pkt); From fba3e575719072c9dec328df5c6f0603bb9d7c6f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 15 Oct 2022 16:59:05 -0700 Subject: [PATCH 195/247] Cleaning up. 
--- src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/PushEngine.py | 7 +-- src/accl/graph/sega/WLEngine.py | 2 - src/accl/graph/sega/centeral_controller.cc | 5 +- src/accl/graph/sega/coalesce_engine.cc | 64 ++++++---------------- src/accl/graph/sega/coalesce_engine.hh | 8 +-- src/accl/graph/sega/push_engine.cc | 58 +------------------- src/accl/graph/sega/push_engine.hh | 4 -- src/accl/graph/sega/wl_engine.cc | 18 ------ src/accl/graph/sega/wl_engine.hh | 6 +- 10 files changed, 23 insertions(+), 152 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index a50a814e89..d462d618e6 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -48,6 +48,3 @@ class CoalesceEngine(BaseMemoryEngine): "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") - workload = Param.String("BFS", "Name of the workload") - - threshold = Param.Float(0.0001, "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 52dc0e2506..20c5452d43 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,8 +34,6 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - workload = Param.String("BFS", "Name of the workload.") - push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be @@ -43,7 +41,7 @@ class PushEngine(BaseMemoryEngine): resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " "edges read from memory.") - + max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " "done per cycle.") @@ -51,6 +49,3 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") - - alpha = Param.Float(0.8, "This parameter is 
specific to pagerank") - diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 7fe392cc9e..5a8ed9c9fd 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -44,5 +44,3 @@ class WLEngine(BaseReduceEngine): "WLEngine has. It can service as " "many updates as this queueu has " "entries at the same time.") - - workload = Param.String("BFS","Name of the workload") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 2074f69f08..fd282834e9 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -55,6 +55,7 @@ CenteralController::initState() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); + mpu->recvWorkload(workload); } const auto& file = params().image_file; if (file == "") @@ -83,10 +84,6 @@ CenteralController::initState() void CenteralController::startup() { - for (auto mpu: mpuVector) { - mpu->recvWorkload(workload); - } - while(!initialUpdates.empty()) { PacketPtr front = initialUpdates.front(); for (auto mpu: mpuVector) { diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 20bfaf8481..fa5099353e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -50,7 +50,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), postApplyWBQueueSize(params.post_apply_wb_queue_size), - workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -76,52 +75,22 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } -void -CoalesceEngine::algoInit(PacketPtr pkt) -{ - WorkListItem items[numElementsPerLine]; - - if(workload == "PR") { - //TODO: Add Alpha - pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - int bit_index_base = getBitIndexBase(pkt->getAddr()); 
- for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - 0.2); - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); - } - pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); - } - -} - -// bool -// CoalesceEngine::applyCondition(WorkListItem wl) -// { -// if (workload == "BFS") { -// return wl.tempProp != wl.prop; -// } else if (workload == "SSSP") { -// return wl.tempProp < wl.prop; -// } else if (workload == "PR") { -// float float_temp = writeToFloat(wl.tempProp); -// float float_prop = writeToFloat(wl.prop); -// return params().threshold <= abs(float_prop - float_temp); -// } else { -// panic("The workload is not recognized."); -// } -// } - -// bool -// CoalesceEngine::preWBApply(WorkListItem& wl) +// void +// CoalesceEngine::algoInit(PacketPtr pkt) // { -// if (workload == "BFS") { -// uint32_t new_prop = std::min(wl.tempProp, wl.prop); -// wl.tempProp = new_prop; -// wl.prop = new_prop; -// return wl.degree > 0; -// } else { -// panic("The workload is not recognized."); +// WorkListItem items[numElementsPerLine]; + +// if(workload == "PR") { +// //TODO: Add Alpha +// pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); +// int bit_index_base = getBitIndexBase(pkt->getAddr()); +// for (int i = 0; i < numElementsPerLine; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - 0.2); +// needsPush[bit_index_base + i] = 1; +// activeBits.push_back(bit_index_base + i); +// } +// pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); // } // } @@ -150,7 +119,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - algoInit(pkt); + // TODO: Add and implement init function for GraphWorkload. 
+ // graphWorkload->init(pkt); memPort.sendFunctional(pkt); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3492cab9dc..0a2c0ca5ff 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -103,6 +103,7 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; MPU* owner; + GraphWorkload* graphWorkload; int numLines; int numElementsPerLine; @@ -134,13 +135,6 @@ class CoalesceEngine : public BaseMemoryEngine // send for push when getting the read response from memory. std::unordered_map pendingVertexPullReads; - std::string workload; - GraphWorkload* graphWorkload; - - void algoInit(PacketPtr pkt); - bool applyCondition(WorkListItem wl); - bool preWBApply(WorkListItem& wl); - MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a661a755b7..c54f19307f 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -151,60 +151,6 @@ PushEngine::done() (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } -// uint32_t -// PushEngine::reduce(uint32_t update, uint32_t value) -// { -// std::string workload = params().workload; -// uint32_t new_value; -// if(workload == "BFS"){ -// new_value = std::min(update, value); -// } else if(workload == "PR"){ -// new_value = update + value; -// } else if(workload == "SSSP"){ -// new_value = std::min(update, value); -// } else{ -// panic("Workload not implemented\n"); -// } -// return new_value; -// } - -// uint32_t -// PushEngine::propagate(uint32_t delta, uint32_t weight) -// { -// std::string workload = params().workload; -// uint32_t update; -// if (workload == "BFS") { -// update = delta + 1; -// } else if (workload == "SSSP") { -// update = delta + weight; -// } else if (workload == "PR") { -// float 
float_form = writeToFloat(delta); -// float float_update = float_form * weight * params().alpha; -// update = readFromFloat(float_update); -// } else{ -// panic("The workload %s is not supported", workload); -// } -// return update; -// } - -// uint32_t -// PushEngine::calculateValue(WorkListItem wl) -// { -// std::string workload = params().workload; -// uint32_t delta; -// if (workload == "PR") { -// float property = writeToFloat(wl.prop) / wl.degree; -// delta = readFromFloat(property); -// } else if (workload == "BFS") { -// delta = wl.prop; -// } else if (workload == "SSSP") { -// delta = wl.prop; -// } else { -// panic("Workload not supported."); -// } -// return delta; -// } - void PushEngine::start() { @@ -251,7 +197,7 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); - + numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -364,7 +310,7 @@ PushEngine::processNextPropagateEvent() DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); - uint32_t update_value = + uint32_t update_value = graphWorkload->propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); metaEdgeQueue.pop_front(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 47db96d818..1112176897 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,10 +133,6 @@ class PushEngine : public BaseMemoryEngine int maxPropagatesPerCycle; std::deque> metaEdgeQueue; - uint32_t reduce(uint32_t update, uint32_t value); - uint32_t propagate(uint32_t value, uint32_t weight); - uint32_t calculateValue(WorkListItem wl); - int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); bool 
enqueueUpdate(Update update); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 86acd40b69..85fe9be2ca 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -145,24 +145,6 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } -// uint32_t -// WLEngine::reduce(uint32_t update, uint32_t value) -// { -// uint32_t new_value; -// if(workload == "BFS"){ -// new_value = std::min(update, value); -// } else if(workload == "PR"){ -// float float_value = writeToFloat(value); -// float float_update = writeToFloat(update); -// new_value = readFromFloat(float_update + float_value); -// } else if(workload == "SSSP"){ -// new_value = std::min(update, value); -// } else{ -// panic("Workload not implemented."); -// } -// return new_value; -// } - bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 0d0e532269..f442d6060e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -72,7 +72,7 @@ class WLEngine : public BaseReduceEngine MPU* owner; GraphWorkload* graphWorkload; - + std::vector inPorts; int updateQueueSize; @@ -81,12 +81,8 @@ class WLEngine : public BaseReduceEngine int registerFileSize; std::unordered_map registerFile; std::unordered_map vertexReadTime; - std::unordered_map workListFile; - std::string workload; - uint32_t reduce(uint32_t update, uint32_t value); - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); From 01ab8f8809451179d27f3f5da7be57675161f4e7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 16 Oct 2022 17:05:07 -0700 Subject: [PATCH 196/247] Implementing post push wb buffer. 
--- src/accl/graph/base/graph_workload.cc | 19 +- src/accl/graph/base/graph_workload.hh | 6 +- src/accl/graph/sega/CoalesceEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 239 +++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 10 +- src/accl/graph/sega/mpu.cc | 12 +- src/accl/graph/sega/mpu.hh | 4 +- src/accl/graph/sega/push_engine.cc | 17 +- src/accl/graph/sega/push_engine.hh | 23 ++- 9 files changed, 223 insertions(+), 109 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 3d0d45b1de..6a8e000515 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -28,10 +28,10 @@ #include "accl/graph/base/graph_workload.hh" -namespace gem5 +namespace gem5 { -uint32_t +uint32_t BFSWorkload::reduce(uint32_t update, uint32_t value) { return std::min(update, value); @@ -43,7 +43,7 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) return value + 1; } -bool +bool BFSWorkload::applyCondition(WorkListItem wl) { return wl.tempProp < wl.prop; @@ -52,15 +52,20 @@ BFSWorkload::applyCondition(WorkListItem wl) bool BFSWorkload::preWBApply(WorkListItem& wl) { - wl.prop = wl.tempProp; - return wl.degree > 0; + if (applyCondition(wl)) { + wl.prop = wl.tempProp; + if (wl.degree > 0) { + return true; + } + } + return false; } -std::tuple +std::tuple BFSWorkload::prePushApply(WorkListItem& wl) { uint32_t value = wl.prop; - return std::make_tuple(value, false); + return std::make_tuple(value, true, false); } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 304b434a3d..c4db5c9e2f 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -46,7 +46,7 @@ class GraphWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; virtual bool preWBApply(WorkListItem& wl) = 0; - virtual std::tuple 
prePushApply(WorkListItem& wl) = 0; + virtual std::tuple prePushApply(WorkListItem& wl) = 0; }; class BFSWorkload : public GraphWorkload @@ -56,7 +56,7 @@ class BFSWorkload : public GraphWorkload uint32_t initValue; public: BFSWorkload(uint64_t init_addr, uint32_t init_value): - GraphWorkload(), + GraphWorkload(), initAddr(init_addr), initValue(init_value) {} @@ -66,7 +66,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); }; } diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index d462d618e6..1fd3b968c5 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -43,7 +43,7 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. 
Used to limit b/w.") - post_apply_wb_queue_size = Param.Int("Maximum number of pending wb after " + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fa5099353e..0c223a8a5b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,16 +49,17 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), - postApplyWBQueueSize(params.post_apply_wb_queue_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + maxPotentialPostPushWB(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), nextResponseEvent([this] { processNextResponseEvent(); }, name() + ".nextResponseEvent"), - nextApplyEvent([this] { - processNextApplyEvent(); - }, name() + ".nextApplyEvent"), + nextPreWBApplyEvent([this] { + processNextPreWBApplyEvent(); + }, name() + ".nextPreWBApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -469,7 +470,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) onTheFlyReqs--; Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); + WorkListItem* items = pkt->getPtr(); + bool do_wb = false; if (pkt->findNextSenderState()) { assert(!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid))); @@ -480,7 +483,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "for addr %lu.\n", __func__, addr); int it = getBitIndexBase(addr); uint64_t send_mask = pendingVertexPullReads[addr]; - WorkListItem* items = pkt->getPtr(); // No applying of the line needed. 
for (int i = 0; i < numElementsPerLine; i++) { Addr vertex_addr = addr + i * sizeof(WorkListItem); @@ -489,19 +491,30 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(needsPush[it + i] == 1); needsPush[it + i] = 0; _workCount--; - owner->recvVertexPush(vertex_addr, items[i]); + + uint32_t delta; + bool do_push, do_wb_v; + std::tie(delta, do_push, do_wb_v) = + graphWorkload->prePushApply(items[i]); + do_wb |= do_wb_v; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + items[i].edgeIndex, items[i].degree); + } else { + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } pendingVertexPullReads.erase(addr); - delete pkt; - return true; + maxPotentialPostPushWB--; } if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); + "fill cacheBlocks[%d].\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); assert(!cacheBlocks[block_index].valid); @@ -512,19 +525,30 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); assert(MSHR.find(block_index) != MSHR.end()); - pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, - peerMemoryAtomSize); + std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, - cacheBlocks[block_index].items[i].to_string()); + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, + cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsWB |= do_wb; cacheBlocks[block_index].pendingData = false; 
cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - delete pkt; + } else if (do_wb) { + PacketPtr wb_pkt = createWritePacket( + addr, peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: Fuck 2.\n", __func__); } for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { @@ -570,6 +594,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } + delete pkt; return true; } @@ -675,8 +700,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); } } else { assert(MSHR.size() <= numMSHREntries); @@ -742,7 +767,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } void -CoalesceEngine::processNextApplyEvent() +CoalesceEngine::processNextPreWBApplyEvent() { int block_index = applyQueue.front(); DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. 
" @@ -757,27 +782,22 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[index])) { - // TODO: Implement this function - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - int bit_index_base = - getBitIndexBase(cacheBlocks[block_index].addr); - - if (do_push) { - if (needsPush[bit_index_base + index] == 0) { - _workCount++; - needsPush[bit_index_base + index] = 1; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + if (do_push) { + int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); + if (needsPush[bit_index_base + index] == 0) { + _workCount++; + needsPush[bit_index_base + index] = 1; + activeBits.push_back(bit_index_base + index); + if (!owner->running()) { + owner->start(); } } } } stats.bitvectorLength.sample(needsPush.count()); - cacheBlocks[block_index].needsWB = true; + assert(cacheBlocks[block_index].needsWB); cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].lastChangedTick = curTick(); @@ -810,8 +830,8 @@ CoalesceEngine::processNextApplyEvent() applyQueue.pop_front(); if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); } if (done()) { @@ -870,16 +890,78 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) assert(cacheBlocks[block_index].pendingData); assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - 
DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != + + bool need_send_pkt = true; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].needsWB = true; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == cacheBlocks[block_index].addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. 
+ cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + postPushWBQueue.erase(wb); + need_send_pkt = false; + } + } + + if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != pendingVertexPullReads.end()) { - stats.numDoubleMemReads++; + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } } } @@ -948,6 +1030,18 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + memPort.sendPacket(wb_pkt); + postPushWBQueue.pop_front(); + } +} + std::tuple CoalesceEngine::getOptimalPullAddr() { @@ -1017,6 +1111,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) assert(vertex_send_mask == 0); send_mask |= (1 << index_offset); pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; } if (bit_status == BitStatus::IN_CACHE) { // renaming the outputs to their local names. 
@@ -1030,35 +1125,39 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) needsPush[slice_base_index + wl_offset] = 0; _workCount--; - // TODO: Implement a function like this. - // uint32_t delta, bool do_wb = prePushApply(cacheBlocks[block_index].items[wl_offset]); - // TODO: After implementing the above function get rid of this bool - // if (applyBeforePush) { - // cacheBlocks[block_index].items[wl_offset].prop = - // cacheBlocks[block_index].items[wl_offset].tempProp; - // } - // TODO: Implement recvVertexPush2 in PushEngine. - // owner->recvVertexPush2(vertex_addr, delta, - // cacheBlocks[block_index].items[wl_offset].edgeIndex, - // cacheBlocks[block_index].items[wl_offset].degree); - owner->recvVertexPush( - vertex_addr, cacheBlocks[block_index].items[wl_offset]); + uint32_t delta; + bool do_push, do_wb; + std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].needsWB |= do_wb; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + cacheBlocks[block_index].items[wl_offset].edgeIndex, + cacheBlocks[block_index].items[wl_offset].degree); + } else { + DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); + owner->recvPrevPullCorrection(); + } stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; + numPullsReceived--; } if (bit_status == BitStatus::IN_MEMORY) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - pendingVertexPullReads[addr] = send_mask; + if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << 
index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + onTheFlyReqs++; + maxPotentialPostPushWB++; + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } } - numPullsReceived--; } stats.bitvectorSearchStatus[bit_status]++; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0a2c0ca5ff..c0091a494d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -123,14 +123,15 @@ class CoalesceEngine : public BaseMemoryEngine UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; - int postApplyWBQueueSize; - std::deque postApplyWBQueue; + int postPushWBQueueSize; + std::deque> postPushWBQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalPullAddr(); + int maxPotentialPostPushWB; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. 
std::unordered_map pendingVertexPullReads; @@ -140,14 +141,15 @@ class CoalesceEngine : public BaseMemoryEngine void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); + EventFunctionWrapper nextPreWBApplyEvent; + void processNextPreWBApplyEvent(); struct CoalesceStats : public statistics::Group { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 70f1e05f32..b91aa21a53 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -80,9 +81,16 @@ MPU::recvWorkload(GraphWorkload* workload) } void -MPU::recvVertexPush(Addr addr, WorkListItem wl) +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) { - pushEngine->recvVertexPush(addr, wl); + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvPrevPullCorrection() +{ + DPRINTF(MPU, "%s: Fuck!\n", __func__); } void diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8f6101c325..8f3b29f603 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,9 +75,9 @@ class MPU : public SimObject void recvVertexPull() { return coalesceEngine->recvVertexPull(); } bool running() { return pushEngine->running(); } void start() { return pushEngine->start(); } - void recvVertexPush(Addr addr, WorkListItem wl); - void recvVertexPush2(Addr addr, uint32_t delta, + void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t 
degree); + void recvPrevPullCorrection(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c54f19307f..c76567696e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -184,18 +184,18 @@ PushEngine::processNextVertexPullEvent() } void -PushEngine::recvVertexPush(Addr addr, WorkListItem wl) +PushEngine::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) { - assert(wl.degree > 0); + assert(degree > 0); assert((edgePointerQueueSize == 0) || ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); - Addr start_addr = wl.edgeIndex * sizeof(Edge); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); - // uint32_t value = calculateValue(wl); - EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; @@ -207,6 +207,7 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) (!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); } + } void @@ -229,7 +230,7 @@ PushEngine::processNextMemoryReadEvent() "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; + PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1112176897..848c93e313 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh 
@@ -66,21 +66,24 @@ class PushEngine : public BaseMemoryEngine class EdgeReadInfoGen { private: + Addr _src; + uint32_t _delta; + Addr _start; Addr _end; size_t _step; size_t _atom; - Addr _src; - uint32_t _value; - public: - EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _src(src), _value(value) + EdgeReadInfoGen(Addr src, uint32_t delta, Addr start, + Addr end, size_t step, size_t atom): + _src(src), _delta(delta), _start(start), + _end(end), _step(step), _atom(atom) {} + Addr src() { return _src; } + uint32_t delta() { return _delta; } + std::tuple nextReadPacketInfo() { panic_if(done(), "Should not call nextPacketInfo when done.\n"); @@ -105,9 +108,6 @@ class PushEngine : public BaseMemoryEngine } bool done() { return (_start >= _end); } - - Addr src() { return _src; } - uint32_t value() { return _value; } }; struct PushInfo { Addr src; @@ -197,8 +197,7 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } - void recvVertexPush(Addr addr, WorkListItem wl); - void recvVertexPush2(Addr addr, uint32_t delta, + void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); void recvReqRetry(); From 932aec66eb6997d2be580eb711f299ee41d1559b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 17 Oct 2022 08:40:47 -0700 Subject: [PATCH 197/247] Implementing correction function for PushEngine. 
--- src/accl/graph/sega/coalesce_engine.cc | 5 +++-- src/accl/graph/sega/mpu.cc | 2 +- src/accl/graph/sega/push_engine.cc | 9 +++++++++ src/accl/graph/sega/push_engine.hh | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0c223a8a5b..441457f2e8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -501,9 +501,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) owner->recvVertexPush(vertex_addr, delta, items[i].edgeIndex, items[i].degree); } else { + // TODO: Add a stat to count this. owner->recvPrevPullCorrection(); } - stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } @@ -548,7 +548,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextMemoryEvent, nextCycle()); } } else { - DPRINTF(CoalesceEngine, "%s: Fuck 2.\n", __func__); + // TODO: Add a stat to count this. + DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); } for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index b91aa21a53..b30060238d 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -90,7 +90,7 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, void MPU::recvPrevPullCorrection() { - DPRINTF(MPU, "%s: Fuck!\n", __func__); + pushEngine->recvPrevPullCorrection(); } void diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c76567696e..07f37a28dc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -207,7 +207,16 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, (!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); } +} +void +PushEngine::recvPrevPullCorrection() +{ + assert(numPendingPulls > 0); + numPendingPulls--; + if (workLeft() && vertexSpace() && 
(!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } } void diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 848c93e313..2e1de25390 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -199,6 +199,7 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); + void recvPrevPullCorrection(); void recvReqRetry(); From 60ea8db3c1de4536d384c9b03e782db5739bf7b9 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 19 Oct 2022 08:03:16 -0700 Subject: [PATCH 198/247] Adding initialization to graphWorkloads --- configs/accl/sega-hbm.py | 4 +- src/accl/graph/base/data_structs.hh | 2 + src/accl/graph/base/graph_workload.cc | 72 ++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 44 +++++++++++-- src/accl/graph/sega/centeral_controller.cc | 22 ++----- src/accl/graph/sega/centeral_controller.hh | 7 ++- src/accl/graph/sega/coalesce_engine.cc | 3 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 8 files changed, 128 insertions(+), 28 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 50fd5f3069..9078c185f3 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -57,7 +57,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): num_mshr_entry=64, num_tgts_per_mshr=64, max_resp_per_cycle=8, - post_apply_wb_queue_size=64 + post_push_wb_queue_size=64 ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -136,7 +136,7 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): self.ctrl.createInitialBFSUpdate(init_addr, init_value) - + def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 
2d81375b63..70babf5960 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -37,6 +37,8 @@ #include #include +#define MAX_BITVECTOR_SIZE (1 << 28) + namespace gem5 { diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 6a8e000515..542f2e0221 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -31,6 +31,37 @@ namespace gem5 { +BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): + GraphWorkload(), initValue(init_value), atomSize(atom_size) +{ + initAddrBase = roundDown(init_addr, atomSize); + initIndex = (init_addr - initAddrBase) / atomSize; + numElementsPerLine = atomSize / sizeof(WorkListItem); +} + + +void +BFSWorkload::init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) +{ + if (pkt->getAddr() == initAddrBase) { + WorkListItem items[numElementsPerLine]; + + pkt->writeDataToBlock((uint8_t*) items, atomSize); + + items[initIndex].tempProp = initValue; + items[initIndex].prop = initValue; + needsPush[bit_index_base + initIndex] = 1; + activeBits.push_back(bit_index_base + initIndex); + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, atomSize); + } + +} + uint32_t BFSWorkload::reduce(uint32_t update, uint32_t value) { @@ -68,4 +99,45 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + return update+value; +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + return (alpha*value*weight); +} + +bool +PRWorkload::applyCondition(WorkListItem wl) +{ + return wl.tempProp != wl.prop; +} + +bool +PRWorkload::preWBApply(WorkListItem& wl) +{ + if (applyCondition(wl)) { + if (wl.degree > 0) { + return true; + } + } + return false; +} + +std::tuple +PRWorkload::prePushApply(WorkListItem& wl) +{ + uint32_t delta = 
abs(wl.prop - wl.tempProp)/wl.degree; + if (delta > threshold) { + return std::make_tuple(delta, true, true); + } + uint32_t value = wl.tempProp; + return std::make_tuple(value, false, false); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index c4db5c9e2f..cc0767305a 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -29,9 +29,13 @@ #ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ #define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#include +#include #include #include "accl/graph/base/data_structs.hh" +#include "base/intmath.hh" +#include "mem/packet.hh" namespace gem5 @@ -42,6 +46,10 @@ class GraphWorkload public: GraphWorkload() {} ~GraphWorkload() {} + + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; @@ -52,16 +60,42 @@ class GraphWorkload class BFSWorkload : public GraphWorkload { private: - uint64_t initAddr; + uint64_t initAddrBase; + int initIndex; uint32_t initValue; + int numElementsPerLine; + int atomSize; public: - BFSWorkload(uint64_t init_addr, uint32_t init_value): - GraphWorkload(), - initAddr(init_addr), initValue(init_value) - {} + BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size); ~BFSWorkload() {} + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual bool applyCondition(WorkListItem wl); + virtual bool preWBApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); +}; + + +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + public: + 
PRWorkload(float alpha, float threshold): + GraphWorkload(), alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fd282834e9..dbd1705e8a 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -51,12 +51,13 @@ CenteralController::CenteralController(const Params& params): } void -CenteralController::initState() +CenteralController::startup() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); mpu->recvWorkload(workload); } + const auto& file = params().image_file; if (file == "") return; @@ -79,22 +80,11 @@ CenteralController::initState() }, system->cacheLineSize()); panic_if(!image.write(proxy), "%s: Unable to write image."); -} -void -CenteralController::startup() -{ - while(!initialUpdates.empty()) { - PacketPtr front = initialUpdates.front(); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - for (auto range: range_list) { - if (range.contains(front->getAddr())) { - mpu->handleIncomingUpdate(front); - } - } + for (auto mpu: mpuVector) { + if (!mpu->running() && (mpu->workCount ()> 0)) { + mpu->start(); } - initialUpdates.pop_front(); } } @@ -140,7 +130,7 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { - workload = new BFSWorkload(init_addr, init_value); + workload = new BFSWorkload(init_addr, init_value, system->cacheLineSize()); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 
1f1df00b4b..4c5ff28ebe 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -48,8 +48,6 @@ class CenteralController : public ClockedObject private: System* system; - GraphWorkload* workload; - Addr maxVertexAddr; std::deque initialUpdates; @@ -60,10 +58,13 @@ class CenteralController : public ClockedObject template PacketPtr createUpdatePacket(Addr addr, T value); public: + + GraphWorkload* workload; + PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - virtual void initState() override; + // virtual void initState() override; virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 441457f2e8..b91b92c0fb 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -121,7 +121,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) } } else { // TODO: Add and implement init function for GraphWorkload. - // graphWorkload->init(pkt); + int bit_index_base = getBitIndexBase(pkt->getAddr()); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits); memPort.sendFunctional(pkt); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c0091a494d..926caf46db 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -38,7 +38,7 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" -#define MAX_BITVECTOR_SIZE (1 << 28) + namespace gem5 { From 9b91fb71245587cfbd95e11bab0d767e571d69f3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 22 Oct 2022 12:36:32 -0700 Subject: [PATCH 199/247] Fixing algo start issue. 
--- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index dbd1705e8a..61ad7c10b4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,7 +82,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { - if (!mpu->running() && (mpu->workCount ()> 0)) { + if (!mpu->running() && (mpu->workCount()> 0)) { mpu->start(); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b91b92c0fb..72ceba6f89 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1079,7 +1079,7 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple( BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory - } else if (cacheBlocks[block_index].addr != addr) { + } else if ((!cacheBlocks[block_index].valid) || (cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( BitStatus::IN_MEMORY, addr, index_offset); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 926caf46db..8c187f8fb8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -209,7 +209,7 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - int workCount() { return _workCount; } + int workCount() { return needsPush.count(); } void recvVertexPull(); bool done(); From d4644cea189cf0deb4b7714018b2a14153c10d7b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 22 Oct 2022 13:49:41 -0700 Subject: [PATCH 200/247] Fixing block addr initialization. 
--- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 72ceba6f89..5b5374873c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -267,7 +267,7 @@ CoalesceEngine::recvWLRead(Addr addr) // is cold and addr or aligned_addr is 0. It fails because cache block // addr field is initialized to 0. Unfortunately Addr type is unsigned. // So you can not initialized addr to -1. - // assert(cacheBlocks[block_index].addr != aligned_addr); + assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8c187f8fb8..e710553be1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -73,7 +73,7 @@ class CoalesceEngine : public BaseMemoryEngine // Tick lastWLWriteTick; Block() {} Block(int num_elements): - addr(0), + addr(-1), busyMask(0), valid(false), needsApply(false), From e2f68af811ad9a16c5d84aa678d1baf2208f9fe1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 23 Oct 2022 21:43:33 -0700 Subject: [PATCH 201/247] Adding PR. 
--- src/accl/graph/base/graph_workload.cc | 48 ++++++++++++++++++---- src/accl/graph/base/graph_workload.hh | 15 ++++--- src/accl/graph/sega/CenteralController.py | 3 +- src/accl/graph/sega/centeral_controller.cc | 32 +-------------- src/accl/graph/sega/centeral_controller.hh | 8 +--- src/accl/graph/sega/coalesce_engine.cc | 27 ++---------- src/accl/graph/sega/coalesce_engine.hh | 3 -- 7 files changed, 57 insertions(+), 79 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 542f2e0221..cbaef86a76 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -36,13 +36,13 @@ BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size) { initAddrBase = roundDown(init_addr, atomSize); initIndex = (init_addr - initAddrBase) / atomSize; - numElementsPerLine = atomSize / sizeof(WorkListItem); + numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); } void BFSWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits) { if (pkt->getAddr() == initAddrBase) { @@ -99,23 +99,53 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } +PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): + GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) +{ + numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); +} + +void +PRWorkload::init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) +{ + WorkListItem items[numElementsPerLine]; + + pkt->writeDataToBlock((uint8_t*) items, atomSize); + for (int i = 0; i < numElementsPerLine; i++) { + items[i].tempProp = readFromFloat(0); + items[i].prop = readFromFloat(1 - alpha); + needsPush[bit_index_base + i] = 1; + activeBits.push_back(bit_index_base + i); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, 
atomSize); +} uint32_t PRWorkload::reduce(uint32_t update, uint32_t value) { - return update+value; + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); } uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { - return (alpha*value*weight); + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + return readFromFloat(alpha * value_float * weight_float); } bool PRWorkload::applyCondition(WorkListItem wl) { - return wl.tempProp != wl.prop; + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return temp_float != prop_float; } bool @@ -132,12 +162,14 @@ PRWorkload::preWBApply(WorkListItem& wl) std::tuple PRWorkload::prePushApply(WorkListItem& wl) { - uint32_t delta = abs(wl.prop - wl.tempProp)/wl.degree; + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = abs((temp_float - prop_float) / wl.degree); if (delta > threshold) { + wl.prop = wl.tempProp; return std::make_tuple(delta, true, true); } - uint32_t value = wl.tempProp; - return std::make_tuple(value, false, false); + return std::make_tuple(0, false, false); } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index cc0767305a..831da97e71 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -48,7 +48,7 @@ class GraphWorkload ~GraphWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; @@ -65,13 +65,14 @@ class BFSWorkload : public GraphWorkload uint32_t initValue; int numElementsPerLine; int atomSize; + public: BFSWorkload(uint64_t init_addr, uint32_t init_value, int 
atom_size); ~BFSWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); @@ -86,15 +87,17 @@ class PRWorkload : public GraphWorkload private: float alpha; float threshold; + + int numElementsPerLine; + int atomSize; + public: - PRWorkload(float alpha, float threshold): - GraphWorkload(), alpha(alpha), threshold(threshold) - {} + PRWorkload(float alpha, float threshold, int atom_size); ~PRWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 17badf9ec4..09a997696d 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -42,8 +42,7 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") cxx_exports = [ - PyBindMethod("createInitialBFSUpdate"), PyBindMethod("createBFSWorkload"), - PyBindMethod("createInitialPRUpdate"), + PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 61ad7c10b4..57198450d4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -103,30 +103,6 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) return pkt; } -template PacketPtr -CenteralController::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared(addr, sizeof(T), addr, value); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - 
req->setPC(((Addr) value) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - - pkt->setLE(value); - - return pkt; -} - -void -CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) -{ - PacketPtr update = createUpdatePacket(init_addr, init_value); - initialUpdates.push_back(update); -} - void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { @@ -134,13 +110,9 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) } void -CenteralController::createInitialPRUpdate() +CenteralController::createPRWorkload(float alpha, float threshold) { - for (auto mpu: mpuVector) { - if (!mpu->running() && (mpu->workCount() > 0)) { - mpu->start(); - } - } + workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 4c5ff28ebe..9ddb1b35f0 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -49,13 +49,11 @@ class CenteralController : public ClockedObject System* system; Addr maxVertexAddr; - std::deque initialUpdates; std::vector mpuVector; std::unordered_map addrRangeListMap; PacketPtr createReadPacket(Addr addr, unsigned int size); - template PacketPtr createUpdatePacket(Addr addr, T value); public: @@ -63,13 +61,11 @@ class CenteralController : public ClockedObject PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - - // virtual void initState() override; virtual void startup() override; - void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createInitialPRUpdate(); + void createPRWorkload(float alpha, float threshold); + void recvDoneSignal(); void printAnswerToHostSimout(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 
5b5374873c..e71cc1195f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,8 +48,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), - postPushWBQueueSize(params.post_push_wb_queue_size), + numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), maxPotentialPostPushWB(0), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -76,25 +75,6 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } -// void -// CoalesceEngine::algoInit(PacketPtr pkt) -// { -// WorkListItem items[numElementsPerLine]; - -// if(workload == "PR") { -// //TODO: Add Alpha -// pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); -// int bit_index_base = getBitIndexBase(pkt->getAddr()); -// for (int i = 0; i < numElementsPerLine; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - 0.2); -// needsPush[bit_index_base + i] = 1; -// activeBits.push_back(bit_index_base + i); -// } -// pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); -// } -// } - void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -491,7 +471,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (vertex_send_mask != 0) { assert(needsPush[it + i] == 1); needsPush[it + i] = 0; - _workCount--; uint32_t delta; bool do_push, do_wb_v; @@ -550,6 +529,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } } else { // TODO: Add a stat to count this. + // FIXME: This is not a totally wasteful read. e.g. all reads + // for pull in BFS are like this. 
DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); } @@ -788,7 +769,6 @@ CoalesceEngine::processNextPreWBApplyEvent() if (do_push) { int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); if (needsPush[bit_index_base + index] == 0) { - _workCount++; needsPush[bit_index_base + index] = 1; activeBits.push_back(bit_index_base + index); if (!owner->running()) { @@ -1125,7 +1105,6 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) int slice_base_index = getBitIndexBase(addr); needsPush[slice_base_index + wl_offset] = 0; - _workCount--; uint32_t delta; bool do_push, do_wb; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e710553be1..c8fec38e5b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -116,9 +116,6 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; - bool applyBeforeWB; - bool applyBeforePush; - int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; From bb31571e3cab67431ddbd146174997e87716b00b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 23 Oct 2022 22:14:05 -0700 Subject: [PATCH 202/247] Prepping for PR. 
--- configs/accl/sega-hbm.py | 10 +++++----- src/accl/graph/sega/coalesce_engine.cc | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 9078c185f3..1c9276f0a0 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -134,12 +134,12 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) + def print_answer(self): self.ctrl.printAnswerToHostSimout() @@ -169,8 +169,8 @@ def get_inputs(): m5.instantiate() - system.create_initial_bfs_update(init_addr, init_value) - system.create_bfs_workload(init_addr, init_value) + # system.create_bfs_workload(init_addr, init_value) + system.create_pr_workload(0.2, 0.0000001) exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e71cc1195f..2d5445093a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -577,6 +577,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } + + // TODO: Probably check for done here too. delete pkt; return true; } From 9c1f57e6d82ebbf5d3dd7b23e8a5cb0912fb04b4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 25 Oct 2022 13:52:56 -0700 Subject: [PATCH 203/247] Adding print function to GraphWorkload class. 
--- src/accl/graph/base/data_structs.hh | 21 ----------- src/accl/graph/base/graph_workload.cc | 44 ++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 4 +- src/accl/graph/sega/centeral_controller.cc | 4 +- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 70babf5960..d9028e2f10 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -34,7 +34,6 @@ #include #include -#include #include #define MAX_BITVECTOR_SIZE (1 << 28) @@ -181,26 +180,6 @@ class UniqueFIFO } }; -template -float -writeToFloat(T value) -{ - assert(sizeof(T) == sizeof(float)); - float float_form; - std::memcpy(&float_form, &value, sizeof(float)); - return float_form; -} - -template -T -readFromFloat(float value) -{ - assert(sizeof(T) == sizeof(float)); - T float_bits; - std::memcpy(&float_bits, &value, sizeof(float)); - return float_bits; -} - } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index cbaef86a76..ead32c0eb8 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -28,9 +28,34 @@ #include "accl/graph/base/graph_workload.hh" +#include + +#include "base/cprintf.hh" +#include "base/intmath.hh" + namespace gem5 { +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): GraphWorkload(), initValue(init_value), atomSize(atom_size) { @@ -99,6 +124,15 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } +std::string 
+BFSWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) { @@ -172,4 +206,14 @@ PRWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(0, false, false); } +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + return csprintf( + "WorkListItem{tempProp: %f, prop: %u, degree: %u, edgeIndex: %u}", + temp_float, temp_float, wl.degree, wl.edgeIndex + ); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 831da97e71..c391a80c23 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -34,7 +34,6 @@ #include #include "accl/graph/base/data_structs.hh" -#include "base/intmath.hh" #include "mem/packet.hh" @@ -55,6 +54,7 @@ class GraphWorkload virtual bool applyCondition(WorkListItem wl) = 0; virtual bool preWBApply(WorkListItem& wl) = 0; virtual std::tuple prePushApply(WorkListItem& wl) = 0; + virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; class BFSWorkload : public GraphWorkload @@ -79,6 +79,7 @@ class BFSWorkload : public GraphWorkload virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -104,6 +105,7 @@ class PRWorkload : public GraphWorkload virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::string printWorkListItem(const WorkListItem wl); }; } diff --git a/src/accl/graph/sega/centeral_controller.cc 
b/src/accl/graph/sega/centeral_controller.cc index 57198450d4..fc2262e111 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -144,8 +144,8 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - std::string print = csprintf("WorkListItem[%lu][%d]: %s.", - addr, i, items[i].to_string()); + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, + workload->printWorkListItem(items[i])); std::cout << print << std::endl; } From 95c676bd0ec2ddacf512945b4de454bd91f52f6c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 25 Oct 2022 16:48:11 -0700 Subject: [PATCH 204/247] Updating PR --- src/accl/graph/base/graph_workload.cc | 36 +++++++++-------- src/accl/graph/sega/coalesce_engine.cc | 53 ++++++++++++++++---------- src/accl/graph/sega/wl_engine.cc | 10 ++--- 3 files changed, 58 insertions(+), 41 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index ead32c0eb8..9f7e5fc4c5 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -77,8 +77,10 @@ BFSWorkload::init(PacketPtr pkt, int bit_index_base, items[initIndex].tempProp = initValue; items[initIndex].prop = initValue; - needsPush[bit_index_base + initIndex] = 1; - activeBits.push_back(bit_index_base + initIndex); + if (items[initIndex].degree > 0) { + needsPush[bit_index_base + initIndex] = 1; + activeBits.push_back(bit_index_base + initIndex); + } pkt->deleteData(); pkt->allocate(); @@ -150,8 +152,10 @@ PRWorkload::init(PacketPtr pkt, int bit_index_base, for (int i = 0; i < numElementsPerLine; i++) { items[i].tempProp = readFromFloat(0); items[i].prop = readFromFloat(1 - alpha); - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); + if (items[i].degree > 0) { + needsPush[bit_index_base + i] = 1; + 
activeBits.push_back(bit_index_base + i); + } } pkt->deleteData(); pkt->allocate(); @@ -170,7 +174,7 @@ uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(weight); + float weight_float = writeToFloat(1); return readFromFloat(alpha * value_float * weight_float); } @@ -179,27 +183,27 @@ PRWorkload::applyCondition(WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); - return temp_float != prop_float; + float dist = std::abs(temp_float - prop_float); + return dist >= threshold; } bool PRWorkload::preWBApply(WorkListItem& wl) { - if (applyCondition(wl)) { - if (wl.degree > 0) { - return true; - } + if (applyCondition(wl) && (wl.degree > 0)) { + return true; } return false; } std::tuple PRWorkload::prePushApply(WorkListItem& wl) -{ - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float delta = abs((temp_float - prop_float) / wl.degree); - if (delta > threshold) { +{ + if (applyCondition(wl)) { + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + std::cout << "PRWorkload: delta: " << delta << std::endl; wl.prop = wl.tempProp; return std::make_tuple(delta, true, true); } @@ -211,7 +215,7 @@ PRWorkload::printWorkListItem(const WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); return csprintf( - "WorkListItem{tempProp: %f, prop: %u, degree: %u, edgeIndex: %u}", + "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", temp_float, temp_float, wl.degree, wl.edgeIndex ); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 2d5445093a..0d1eecf43f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -157,7 +157,7 @@ CoalesceEngine::recvWLRead(Addr addr) "%lu, and wl_offset: 
%d.\n", __func__, addr, block_index, aligned_addr, wl_offset); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); + block_index, cacheBlocks[block_index].to_string()); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { @@ -176,15 +176,17 @@ CoalesceEngine::recvWLRead(Addr addr) addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // If they are scheduled for apply and WB those schedules should be @@ -476,6 +478,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool do_push, do_wb_v; std::tie(delta, do_push, do_wb_v) = graphWorkload->prePushApply(items[i]); + std::cout << "CoalesceEngine: delta: " << delta << std::endl; do_wb |= do_wb_v; if (do_push) { owner->recvVertexPush(vertex_addr, delta, @@ -508,8 +511,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, - cacheBlocks[block_index].items[i].to_string()); + __func__, block_index, i, graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[i])); } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].needsWB |= do_wb; @@ -550,12 +553,14 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); @@ -603,7 +608,9 @@ CoalesceEngine::processNextResponseEvent() num_responses_sent++; DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); responseQueue.pop_front(); DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " @@ -640,12 +647,13 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " "wl: %s. This request maps to cacheBlocks[%d], " "aligned_addr: %lu, and wl_offset: %d.\n", - __func__, addr, wl.to_string(), + __func__, addr, graphWorkload->printWorkListItem(wl), block_index, aligned_addr, wl_offset); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, wl.to_string(), addr); + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); // Desing does not allow for write misses for now. 
assert(cacheBlocks[block_index].addr == aligned_addr); // cache state asserts @@ -666,13 +674,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { cacheBlocks[block_index].needsApply |= true; + cacheBlocks[block_index].needsWB |= true; } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, - cacheBlocks[block_index].items[wl_offset].to_string()); + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -899,12 +909,14 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); @@ -1061,7 +1073,7 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple( BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory - } else if ((!cacheBlocks[block_index].valid) || (cacheBlocks[block_index].addr != addr)) { + } else if ((cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( BitStatus::IN_MEMORY, addr, index_offset); @@ -1112,6 +1124,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) bool do_push, do_wb; std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( cacheBlocks[block_index].items[wl_offset]); + std::cout << "CoalesceEngine: delta: " << delta << std::endl; cacheBlocks[block_index].needsWB |= do_wb; if (do_push) { owner->recvVertexPush(vertex_addr, delta, diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 85fe9be2ca..a698f2cc0a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -263,10 +263,10 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) workListFile[addr] = wl; DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " "workListFile. workListFile.size = %d.\n", __func__, addr, - wl.to_string(), workListFile.size()); + graphWorkload->printWorkListItem(wl), workListFile.size()); DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " "workListFile. workListFile.size = %d.\n", __func__, addr, - wl.to_string(), workListFile.size()); + graphWorkload->printWorkListItem(wl), workListFile.size()); stats.vertexReadLatency.sample( ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); @@ -287,13 +287,13 @@ WLEngine::processNextReduceEvent() uint32_t update_value = registerFile[addr]; DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" ". 
registerFile[%lu] = %u, workListFile[%lu] = %s.\n", - __func__, addr, registerFile[addr], - addr, workListFile[addr].to_string()); + __func__, addr, registerFile[addr], addr, + graphWorkload->printWorkListItem(workListFile[addr])); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = graphWorkload->reduce(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", - __func__, addr, workListFile[addr].to_string()); + __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); stats.numReduce++; owner->recvWLWrite(addr, workListFile[addr]); From 166c3ac21df0a8175334dc8c426309e603d81b03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Oct 2022 07:11:05 -0700 Subject: [PATCH 205/247] Updating configs for pr and bfs. Fixing bugs for pr. --- configs/accl/bfs.py | 78 +++++++++++ configs/accl/pr.py | 78 +++++++++++ configs/accl/real-graph-gen.py | 41 ++++-- configs/accl/sega-hbm.py | 178 ------------------------- configs/accl/sega.py | 137 +++++++++---------- configs/accl/synth-graph-gen.py | 88 ++++++++---- src/accl/graph/base/graph_workload.cc | 10 +- src/accl/graph/sega/coalesce_engine.cc | 24 ++-- 8 files changed, 332 insertions(+), 302 deletions(-) create mode 100644 configs/accl/bfs.py create mode 100644 configs/accl/pr.py delete mode 100644 configs/accl/sega-hbm.py diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py new file mode 100644 index 0000000000..d02faa96ca --- /dev/null +++ b/configs/accl/bfs.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=float) + argparser.add_argument("init_value", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + ) + + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_bfs_workload(init_addr, init_value) + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py new file mode 100644 index 0000000000..59e8b924c6 --- /dev/null +++ b/configs/accl/pr.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + ) + + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, alpha, threshold, verify = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_pr_workload(alpha, threshold) + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py index db44c63a9a..b943a925c1 100644 --- a/configs/accl/real-graph-gen.py +++ b/configs/accl/real-graph-gen.py @@ -28,14 +28,20 @@ import argparse import subprocess + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("path", type=str, help="Path to the graph file.") - argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + argparser.add_argument( + "num_gpts", + type=int, + help="Number gpts to create synth graph binaries for.", + ) args = argparser.parse_args() return args.path, args.num_gpts + if __name__ == "__main__": graph_path, num_gpts = get_inputs() @@ -59,16 +65,29 @@ def get_inputs(): print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if 
not all([binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + if not all( + [ + binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") + for binary in expected_bins + ] + ): + print( + f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" + ) for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): os.remove(delete.path) print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") - subprocess.run([f"{graph_reader}" , - f"{graph_path}", - "false", - f"{num_gpts}", - "32", - f"{graph_dir}/binaries/gpts_{num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_dir}/binaries/gpts_{num_gpts}") + subprocess.run( + [ + f"{graph_reader}", + f"{graph_path}", + "false", + f"{num_gpts}", + "32", + f"{graph_dir}/binaries/gpts_{num_gpts}", + ] + ) + print( + f"Created the graph binaries in " + f"{graph_dir}/binaries/gpts_{num_gpts}" + ) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py deleted file mode 100644 index 1c9276f0a0..0000000000 --- a/configs/accl/sega-hbm.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret, intlv_low_bit + intlv_bits - 1 - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8, - post_push_wb_queue_size=64 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=512, - update_queue_size=32 - ) - - self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), - dram_2=HBM_2000_4H_1x64()) - - self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - 
range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.wl_engine.in_ports - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - def setReqPort(self, port): - self.push_engine.out_ports = port - - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image - -class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), - 2*num_mpus, - 32 - ) - - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) - gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) - gpt.set_vertex_pch_bit(pch_bit) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) - - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, 
threshold) - - def print_answer(self): - self.ctrl.printAnswerToHostSimout() - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - argparser.add_argument("--verify", type=bool, help="Print final answer") - - args = argparser.parse_args() - - verify = False - if not args.verify is None: - verify = args.verify - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value, verify - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - # system.create_bfs_workload(init_addr, init_value) - system.create_pr_workload(0.2, 0.0000001) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") - if verify: - system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c50c525297..42c07e2e94 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -24,100 +24,111 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import m5 -import argparse - from math import log from m5.objects import * + def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( start=plain_range.start, size=plain_range.size(), intlvHighBit=intlv_low_bit + intlv_bits - 1, xorHighBit=0, intlvBits=intlv_bits, - intlvMatch=i)) - return ret + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) + self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8, + post_push_wb_queue_size=64, + ) self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=32 - ) - - self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) - - self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) + Xpush_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=512, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(), dram_2=HBM_2000_4H_1x64() + ) + + self.edge_mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), in_addr_map=False + ) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = 
self.edge_mem_ctrl.port self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) def getRespPort(self): return self.wl_engine.in_ports + def setRespPort(self, port): self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports + def setReqPort(self, port): self.push_engine.out_ports = port - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.dram.range = vertex_range + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image + class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_mpus, 32 + ) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) - gpt.set_vertex_range(vertex_ranges[i]) + gpt = GPT("2GiB", cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_mpus]] + ) + gpt.set_vertex_pch_bit(pch_bit) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) # Creating the interconnect among mpus @@ -128,31 +139,11 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def 
create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) - m5.instantiate() + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/synth-graph-gen.py b/configs/accl/synth-graph-gen.py index 16985b3537..15e4a6eff2 100644 --- a/configs/accl/synth-graph-gen.py +++ b/configs/accl/synth-graph-gen.py @@ -28,15 +28,27 @@ import argparse import subprocess + def get_inputs(): argparser = argparse.ArgumentParser() - argparser.add_argument("scale", type=int, help="The scale of the synth graph to generate.") - argparser.add_argument("deg", type=int, help="The average degree of the synth graph to generate.") - argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + argparser.add_argument( + "scale", type=int, help="The scale of the synth graph to generate." 
+ ) + argparser.add_argument( + "deg", + type=int, + help="The average degree of the synth graph to generate.", + ) + argparser.add_argument( + "num_gpts", + type=int, + help="Number gpts to create synth graph binaries for.", + ) args = argparser.parse_args() return args.scale, args.deg, args.num_gpts + if __name__ == "__main__": scale, deg, num_gpts = get_inputs() @@ -62,18 +74,27 @@ def get_inputs(): for delete in os.scandir(graph_path): os.remove(delete.path) print(f"Deleted everything in {graph_path}") - subprocess.run([f"{graph_gen}", - f"{scale}", - f"{deg}", - f"{graph_path}/graph_unordered.txt"]) - print(f"Generated a graph with scale " - f"{scale} and deg {deg}") - subprocess.run(["python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt"]) - print(f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt") + subprocess.run( + [ + f"{graph_gen}", + f"{scale}", + f"{deg}", + f"{graph_path}/graph_unordered.txt", + ] + ) + print(f"Generated a graph with scale " f"{scale} and deg {deg}") + subprocess.run( + [ + "python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt", + ] + ) + print( + f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt" + ) subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) print(f"Deleted {graph_path}/graph_unordered.txt") @@ -88,16 +109,31 @@ def get_inputs(): print(f"Created {graph_path}/binaries/gpts_{num_gpts}") expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + if not all( + [ + binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") + for binary in expected_bins + ] + ): + print( + f"Not all expected binaries found in 
{graph_path}/binaries/gpts_{num_gpts}" + ) for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): os.remove(delete.path) - print(f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}") - subprocess.run([f"{graph_reader}" , - f"{graph_path}/graph.txt", - "false", - f"{num_gpts}", - "32", - f"{graph_path}/binaries/gpts_{num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{num_gpts}") + print( + f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}" + ) + subprocess.run( + [ + f"{graph_reader}", + f"{graph_path}/graph.txt", + "false", + f"{num_gpts}", + "32", + f"{graph_path}/binaries/gpts_{num_gpts}", + ] + ) + print( + f"Created the graph binaries in " + f"{graph_path}/binaries/gpts_{num_gpts}" + ) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 9f7e5fc4c5..e362d605c0 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -174,7 +174,9 @@ uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(1); + float weight_float = 1.0; + float delta = alpha * value_float * weight_float; + return readFromFloat(alpha * value_float * weight_float); } @@ -198,14 +200,14 @@ PRWorkload::preWBApply(WorkListItem& wl) std::tuple PRWorkload::prePushApply(WorkListItem& wl) -{ +{ if (applyCondition(wl)) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); float delta = (temp_float - prop_float) / wl.degree; - std::cout << "PRWorkload: delta: " << delta << std::endl; + uint32_t delta_uint = readFromFloat(delta); wl.prop = wl.tempProp; - return std::make_tuple(delta, true, true); + return std::make_tuple(delta_uint, true, true); } return std::make_tuple(0, false, false); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0d1eecf43f..2f6555602c 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -196,7 +196,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; // HACK: If a read happens on the same cycle as another operation such - // apply setLastChangedTick to half a cycle later so that operations + // as apply set lastChangedTick to half a cycle later so that operation // scheduled by the original operation (apply in this example) are // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" cacheBlocks[block_index].lastChangedTick = @@ -478,7 +478,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool do_push, do_wb_v; std::tie(delta, do_push, do_wb_v) = graphWorkload->prePushApply(items[i]); - std::cout << "CoalesceEngine: delta: " << delta << std::endl; do_wb |= do_wb_v; if (do_push) { owner->recvVertexPush(vertex_addr, delta, @@ -517,7 +516,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].needsWB |= do_wb; cacheBlocks[block_index].pendingData = false; - cacheBlocks[block_index].lastChangedTick = curTick(); + // HACK: In case processNextRead is called on the same tick as curTick + // and is scheduled to read to the same cacheBlocks[block_index] + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); @@ -564,7 +566,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); + // cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); it = MSHR[block_index].erase(it); @@ -608,8 +610,8 @@ CoalesceEngine::processNextResponseEvent() num_responses_sent++; DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, - graphWorkload->printWorkListItem(worklist_response), + __func__, + graphWorkload->printWorkListItem(worklist_response), addr_response); responseQueue.pop_front(); @@ -652,7 +654,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, + "with Addr: %lu.\n", __func__, graphWorkload->printWorkListItem(wl), addr); // Desing does not allow for write misses for now. assert(cacheBlocks[block_index].addr == aligned_addr); @@ -874,8 +876,11 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); // A cache block should not be touched while it's waiting for data. 
- assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - // + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); @@ -1124,7 +1129,6 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) bool do_push, do_wb; std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( cacheBlocks[block_index].items[wl_offset]); - std::cout << "CoalesceEngine: delta: " << delta << std::endl; cacheBlocks[block_index].needsWB |= do_wb; if (do_push) { owner->recvVertexPush(vertex_addr, delta, From ffbef8e2cf85c635d8814ccf1951ea145a968fb6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Oct 2022 07:46:18 -0700 Subject: [PATCH 206/247] Fixing typos. --- configs/accl/bfs.py | 8 ++++---- configs/accl/sega.py | 2 +- src/accl/graph/base/graph_workload.cc | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index d02faa96ca..fc32b96642 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -37,8 +37,8 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=float) - argparser.add_argument("init_value", type=float) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) argparser.add_argument( "--verify", dest="verify", @@ -54,8 +54,8 @@ def get_inputs(): args.num_gpts, args.cache_size, args.graph, - args.alpha, - args.threshold, + args.init_addr, + args.init_value, args.verify, ) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 42c07e2e94..0f4b133791 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -59,7 +59,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): 
post_push_wb_queue_size=64, ) self.push_engine = PushEngine( - Xpush_req_queue_size=32, + push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=512, update_queue_size=32, diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index e362d605c0..44136cb4c1 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -175,7 +175,6 @@ PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); float weight_float = 1.0; - float delta = alpha * value_float * weight_float; return readFromFloat(alpha * value_float * weight_float); } From fe146055cc230e532d878a66cd0c1577a81234f3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 27 Oct 2022 14:24:18 -0700 Subject: [PATCH 207/247] Adding sample script. --- configs/accl/pr-sample.py | 109 +++++++++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 2 +- 2 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 configs/accl/pr-sample.py diff --git a/configs/accl/pr-sample.py b/configs/accl/pr-sample.py new file mode 100644 index 0000000000..ac3616dc84 --- /dev/null +++ b/configs/accl/pr-sample.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 10us", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + args.sample, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + verify, + sample, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_pr_workload(alpha, threshold) + + if sample: + while True: + exit_event = m5.simulate(10000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + print(exit_event.getCause()) + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 2f6555602c..1dbe2a0d56 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -519,7 +519,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // HACK: In case processNextRead is called on the same tick as curTick // and is scheduled to read 
to the same cacheBlocks[block_index] cacheBlocks[block_index].lastChangedTick = - curTick() + (Tick) (clockPeriod() / 2); + curTick() - (Tick) (clockPeriod() / 2); } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); From 151a02fbe697abb0713b99c0ff72fa4f16bf63b1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 28 Oct 2022 11:02:32 -0700 Subject: [PATCH 208/247] Fixing sim performance issue. --- src/accl/graph/base/graph_workload.cc | 8 ++++++-- src/accl/graph/base/graph_workload.hh | 9 ++++++--- src/accl/graph/sega/coalesce_engine.cc | 7 +++++-- src/accl/graph/sega/coalesce_engine.hh | 18 ++++++++++++++++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 44136cb4c1..07accff44f 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -68,7 +68,8 @@ BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size) void BFSWorkload::init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) + std::deque& activeBits, + int& _workCount) { if (pkt->getAddr() == initAddrBase) { WorkListItem items[numElementsPerLine]; @@ -80,6 +81,7 @@ BFSWorkload::init(PacketPtr pkt, int bit_index_base, if (items[initIndex].degree > 0) { needsPush[bit_index_base + initIndex] = 1; activeBits.push_back(bit_index_base + initIndex); + _workCount++; } pkt->deleteData(); @@ -144,7 +146,8 @@ PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): void PRWorkload::init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) + std::deque& activeBits, + int& _workCount) { WorkListItem items[numElementsPerLine]; @@ -155,6 +158,7 @@ PRWorkload::init(PacketPtr pkt, int bit_index_base, if (items[i].degree > 0) { needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); + _workCount++; } } 
pkt->deleteData(); diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index c391a80c23..6bbc4935c2 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -48,7 +48,8 @@ class GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) = 0; + std::deque& activeBits, + int& _workCount) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; @@ -73,7 +74,8 @@ class BFSWorkload : public GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits); + std::deque& activeBits, + int& _workCount); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); @@ -99,7 +101,8 @@ class PRWorkload : public GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits); + std::deque& activeBits, + int& _workCount); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1dbe2a0d56..38f05f937a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -47,7 +47,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), 
maxPotentialPostPushWB(0), nextMemoryEvent([this] { @@ -102,7 +102,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) } else { // TODO: Add and implement init function for GraphWorkload. int bit_index_base = getBitIndexBase(pkt->getAddr()); - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); memPort.sendFunctional(pkt); } } @@ -473,6 +473,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (vertex_send_mask != 0) { assert(needsPush[it + i] == 1); needsPush[it + i] = 0; + _workCount--; uint32_t delta; bool do_push, do_wb_v; @@ -784,6 +785,7 @@ CoalesceEngine::processNextPreWBApplyEvent() int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); if (needsPush[bit_index_base + index] == 0) { needsPush[bit_index_base + index] = 1; + _workCount++; activeBits.push_back(bit_index_base + index); if (!owner->running()) { owner->start(); @@ -1124,6 +1126,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) int slice_base_index = getBitIndexBase(addr); needsPush[slice_base_index + wl_offset] = 0; + _workCount--; uint32_t delta; bool do_push, do_wb; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c8fec38e5b..64c5c4af46 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -52,6 +52,17 @@ enum BitStatus NUM_STATUS }; +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_PRE_WB_APPLY, + PENDING_WB, + NUM_CACHE_STATE +}; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -69,6 +80,7 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingApply; bool pendingWB; Tick lastChangedTick; + CacheState state; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -81,7 +93,8 @@ class CoalesceEngine : public BaseMemoryEngine pendingData(false), pendingApply(false), pendingWB(false), - lastChangedTick(0) 
+ lastChangedTick(0), + state(CacheState::INVALID) { items = new WorkListItem [num_elements]; } @@ -116,6 +129,7 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; + int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; @@ -206,7 +220,7 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - int workCount() { return needsPush.count(); } + int workCount() { return _workCount; } void recvVertexPull(); bool done(); From 82d076c4bc2efca79614cb40f08ec080bd8ac7ac Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 31 Oct 2022 09:53:00 -0700 Subject: [PATCH 209/247] Fixing write miss issue. --- src/accl/graph/sega/coalesce_engine.cc | 92 ++++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 30 ++++++++- 2 files changed, 76 insertions(+), 46 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 38f05f937a..7a064c1c2f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -495,6 +495,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) maxPotentialPostPushWB--; } + bool cache_wb = false; if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " "fill cacheBlocks[%d].\n", __func__, block_index); @@ -521,6 +522,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // and is scheduled to read to the same cacheBlocks[block_index] cacheBlocks[block_index].lastChangedTick = curTick() - (Tick) (clockPeriod() / 2); + cache_wb = true; } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); @@ -537,42 +539,44 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Add a stat to count this. // FIXME: This is not a totally wasteful read. e.g. all reads // for pull in BFS are like this. 
- DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); + DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); } - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; + if (cache_wb) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. 
+ cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } } } @@ -1045,7 +1049,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) } } -std::tuple +std::tuple CoalesceEngine::getOptimalPullAddr() { int visited_bits = 0; @@ -1066,7 +1070,7 @@ CoalesceEngine::getOptimalPullAddr() assert(vertex_send_mask == 0); activeBits.pop_front(); return std::make_tuple( - BitStatus::PENDING_READ, addr, index_offset); + WorkLocation::PENDING_READ, addr, index_offset); } else { // Only if it is in cache and it is in idle state. if ((cacheBlocks[block_index].addr == addr) && @@ -1078,12 +1082,12 @@ CoalesceEngine::getOptimalPullAddr() assert(!cacheBlocks[block_index].pendingData); activeBits.pop_front(); return std::make_tuple( - BitStatus::IN_CACHE, block_index, index_offset); + WorkLocation::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory } else if ((cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( - BitStatus::IN_MEMORY, addr, index_offset); + WorkLocation::IN_MEMORY, addr, index_offset); } } activeBits.pop_front(); @@ -1091,20 +1095,20 @@ CoalesceEngine::getOptimalPullAddr() visited_bits++; } - return std::make_tuple(BitStatus::GARBAGE, 0, 0); + return std::make_tuple(WorkLocation::GARBAGE, 0, 0); } void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - BitStatus bit_status; + WorkLocation bit_status; Addr location; int offset; std::tie(bit_status, location, offset) = getOptimalPullAddr(); - if (bit_status != BitStatus::GARBAGE) { - if (bit_status == BitStatus::PENDING_READ) { + if (bit_status != WorkLocation::GARBAGE) { + if (bit_status == WorkLocation::PENDING_READ) { // renaming the outputs to thier local names. 
Addr addr = location; int index_offset = offset; @@ -1116,7 +1120,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; numPullsReceived--; } - if (bit_status == BitStatus::IN_CACHE) { + if (bit_status == WorkLocation::IN_CACHE) { // renaming the outputs to their local names. int block_index = (int) location; int wl_offset = offset; @@ -1145,7 +1149,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) stats.lastVertexPushTime = curTick() - stats.lastResetTick; numPullsReceived--; } - if (bit_status == BitStatus::IN_MEMORY) { + if (bit_status == WorkLocation::IN_MEMORY) { if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { Addr addr = location; int index_offset = offset; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 64c5c4af46..05e268270a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -43,7 +43,7 @@ namespace gem5 { -enum BitStatus +enum WorkLocation { PENDING_READ, IN_CACHE, @@ -65,6 +65,32 @@ enum CacheState class MPU; + +// TODO: Add active bit to WorkListItem class. Check active bit before activate +// Only activate if necessary and not active before. 
+class WorkDirectory +{ + private: + CoalesceEngine* owner; + Addr memoryAtomSize; + int atomBlockSize; + size_t elementSize; + + int _workCount; + public: + AddrRange memoryRange; + WorkDirectory(Addr atom_size, int block_size, size_t element_size): + memoryAtomSize(atom_size), atomBlockSize(block_size), + elementSize(element_size), _workCount(0) + {} + + void activate(Addr addr); + void deactivate(Addr addr); + int workCount(); + std::tuple getNextWork(); + +}; + class CoalesceEngine : public BaseMemoryEngine { private: @@ -140,7 +166,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); + std::tuple getOptimalPullAddr(); int maxPotentialPostPushWB; // A map from addr to sendMask. sendMask determines which bytes to From f217715d8eae9774027635e6652755cdeaab0c00 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 1 Nov 2022 00:15:16 -0700 Subject: [PATCH 210/247] Restructuring the cache. 
--- src/accl/graph/base/data_structs.hh | 17 +- src/accl/graph/sega/CoalesceEngine.py | 2 - src/accl/graph/sega/CoalesceEngine_bak.py | 50 + src/accl/graph/sega/coalesce_engine.cc | 553 +++------ src/accl/graph/sega/coalesce_engine.hh | 107 +- src/accl/graph/sega/coalesce_engine_bak.cc | 1308 ++++++++++++++++++++ src/accl/graph/sega/coalesce_engine_bak.hh | 218 ++++ 7 files changed, 1834 insertions(+), 421 deletions(-) create mode 100644 src/accl/graph/sega/CoalesceEngine_bak.py create mode 100644 src/accl/graph/sega/coalesce_engine_bak.cc create mode 100644 src/accl/graph/sega/coalesce_engine_bak.hh diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index d9028e2f10..070e635736 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -45,29 +45,33 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; - uint32_t degree : 32; uint32_t edgeIndex : 32; + uint32_t degree : 31; + bool active: 1; std::string to_string() { return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - tempProp, prop, degree, edgeIndex); + "WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u, active: %s}", tempProp, prop, edgeIndex, degree, + active ? 
"true" : "false"); } WorkListItem(): tempProp(0), prop(0), + edgeIndex(0), degree(0), - edgeIndex(0) + active(false) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t degree, uint32_t edge_index): + uint32_t edge_index, uint32_t degree, bool active): tempProp(temp_prop), prop(prop), + edgeIndex(edge_index), degree(degree), - edgeIndex(edge_index) + active(active) {} }; @@ -88,7 +92,6 @@ struct __attribute__ ((packed)) Edge weight(weight), neighbor(neighbor) {} - }; static_assert(isPowerOf2(sizeof(WorkListItem))); diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 1fd3b968c5..8ec9214b49 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -38,8 +38,6 @@ class CoalesceEngine(BaseMemoryEngine): num_mshr_entry = Param.Int("Number of MSHR entries.") - num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") diff --git a/src/accl/graph/sega/CoalesceEngine_bak.py b/src/accl/graph/sega/CoalesceEngine_bak.py new file mode 100644 index 0000000000..1fd3b968c5 --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine_bak.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + num_mshr_entry = Param.Int("Number of MSHR entries.") + + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. Used to limit b/w.") + + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7a064c1c2f..66ff66c068 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,10 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), - numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), - maxPotentialPostPushWB(0), + maxRespPerCycle(params.max_resp_per_cycle), cacheWorkCount(0), + numPullsReceived(0), activeBufferSize(params.post_push_wb_queue_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + pendingPullReads(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -59,6 +59,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): nextPreWBApplyEvent([this] { processNextPreWBApplyEvent(); }, name() + ".nextPreWBApplyEvent"), + nextPrePushApplyEvent([this] { + 
processNextPrePushApplyEvent(); + }, name() + ".nextPrePushApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -66,7 +69,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - needsPush.reset(); } void @@ -83,15 +85,10 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); + // TODO: Check postPushWBQueue for hits if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsApply); - // NOTE: No need to check needsWB because there might be entries - // that have been updated and not written back in the cache. - // assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(cacheBlocks[block_index].state == CacheState::IDLE); pkt->makeResponse(); pkt->setDataFromBlock( @@ -100,8 +97,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - // TODO: Add and implement init function for GraphWorkload. int bit_index_base = getBitIndexBase(pkt->getAddr()); + // FIXME: Pass workdirectory to graphworkload.init graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); memPort.sendFunctional(pkt); } @@ -110,6 +107,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) bool CoalesceEngine::done() { + // FIXME: Fix this later return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -123,6 +121,8 @@ CoalesceEngine::getBlockIndex(Addr addr) return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } +// FIXME: This and the next function should be moved to the +// WorkDirectory. 
// addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBitIndexBase(Addr addr) @@ -134,6 +134,7 @@ CoalesceEngine::getBitIndexBase(Addr addr) return atom_index * block_bits; } +// FIXME: Read FIXME: Above // index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) @@ -161,17 +162,10 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { + // Hit DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; - assert(!cacheBlocks[block_index].pendingData); - // No cache block could be in pendingApply and pendingWB at the - // same time. - assert(!(cacheBlocks[block_index].pendingApply && - cacheBlocks[block_index].pendingWB)); - // Hit - // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextResponseEvent for latency cycles in - // the future. + assert(cacheBlocks[block_index].state != CacheState::INVALID); responseQueue.push_back(std::make_tuple( addr, cacheBlocks[block_index].items[wl_offset], curTick())); @@ -189,12 +183,7 @@ CoalesceEngine::recvWLRead(Addr addr) responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // If they are scheduled for apply and WB those schedules should be - // discarded. Since there is no easy way to take items out of the - // function queue. Those functions check for their respective bits - // and skip the process if the respective bit is set to false. 
- cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].state = CacheState::BUSY; // HACK: If a read happens on the same cycle as another operation such // as apply set lastChangedTick to half a cycle later so that operation // scheduled by the original operation (apply in this example) are @@ -210,34 +199,20 @@ CoalesceEngine::recvWLRead(Addr addr) stats.numVertexReads++; return true; } else if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].pendingData)) { + (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { // Hit under miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", __func__, addr); stats.readHitUnderMisses++; assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(!cacheBlocks[block_index].dirty); + assert(!cacheBlocks[block_index].needsPreWBApply); assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " - "cacheBlocks[%d].\n", __func__, block_index); - } MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -245,195 +220,52 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - // FIXME: Make this assert work. It will break if the cache block - // is cold and addr or aligned_addr is 0. It fails because cache block - // addr field is initialized to 0. Unfortunately Addr type is unsigned. - // So you can not initialized addr to -1. assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" - " %lu not found in MSHRs.\n", __func__, block_index, addr); - if (MSHR.size() == numMSHREntries) { - // Out of MSHR entries - DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" - "Rejecting request.\n", __func__); - // TODO: Break out read rejections into more than one stat - // based on the cause of the rejection - stats.mshrEntryShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR " - "entries available.\n", __func__); - if ((cacheBlocks[block_index].valid) || - (cacheBlocks[block_index].pendingData)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state.\n", __func__, block_index); - // We're in idle state - // Idle: valid && !pendingApply && !pendingWB; - // Note 0: needsApply has to be false. Because - // A cache line enters the idle state from two - // other states. First a busy state that does not - // need apply (needsApply is already false) or - // from pendingApplyState after being applied which - // clears the needsApply bit. needsApply is useful - // when a cache block has transitioned from - // pendingApply to busy without the apply happening. - // Note 1: pendingData does not have to be evaluated - // becuase pendingData is cleared when data - // arrives from the memory and valid does not - // denote cleanliness of the line. Rather it - // is used to differentiate between empty blocks - // and the blocks that have data from memory. - // pendingData denotes the transient state between - // getting a miss and getting the data for that miss. - // valid basically means that the data in the cache - // could be used to respond to read/write requests. - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - // There are no conflicts in idle state. 
- assert(MSHR.find(block_index) == MSHR.end()); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" - "to be written back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextWriteBack for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " - "not need to be written back.\n", - __func__, block_index); - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextRead for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } - } - // cacheBlocks[block_index].hasConflict = true; - 
MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - stats.readMisses++; - // TODO: Add readConflicts here. - stats.numVertexReads++; - return true; - } else { - // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " - "Allocating a cache line for it.\n" - , __func__, addr); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR[block_index].size() == 0); - - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" - " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); + + if (cacheBlocks[block_index].state != CacheState::INVALID) { + // conflict miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " + "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + cacheBlocks[block_index].hasConflict = true; + if (cacheBlocks[block_index].state == CacheState::IDLE) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = 
CacheState::PENDING_WB; memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); + processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " - "input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, - cacheBlocks[block_index].to_string()); - stats.readMisses++; - stats.numVertexReads++; - return true; + } else { + // NOTE: move the cache block to invalid state + // FIXME: Fix the issue below. + // May need to activate tracking for this + cacheBlocks[block_index].reset(); } } + // return int instead of bool to tell WLEngine to whether + // roll the first entry in the queue. + return false; } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs. It has a conflict " - "with addr: %lu.\n", __func__, block_index, addr, - cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - assert(MSHR[block_index].size() > 0); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + if (MSHR.size() < numMSHREntries) { + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].needsPreWBApply = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + return true; + } else { return false; } - DPRINTF(CoalesceEngine, "%s: There is room for another target " - "for cacheBlocks[%d].\n", __func__, block_index); - - // TODO: Might want to differentiate between different misses. - stats.readMisses++; - - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " - "cacheBlocks[%d].\n", __func__, addr, block_index); - stats.numVertexReads++; - return true; } } } @@ -589,8 +421,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } - - // TODO: Probably check for done here too. delete pkt; return true; } @@ -771,15 +601,53 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextPreWBApplyEvent() { - int block_index = applyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + int block_index = preWBApplyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the preWBApplyQueue. 
" "cacheBlock[%d] to be applied.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingWB); + + if (cacheBlocks[block_index].state == CacheState::PENDING_PRE_WB_APPLY) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsPreWBApply); + bool block_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + bool active = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + block_active |= active; + if (active) { + // cacheWorkCount++; + // FUTUREME: When pulling from activeCacheBlocks, in case we + // face a block that is not in idle state, we basically pop + // that entry and push it to the back. We only delete entries + // in this buffer if pushed or evicted. + activeCacheBlocks.push_back(block_index); + } + } + if (block_active && !owner->running()) { + owner->start(); + } + + cacheBlocks[block_index].needsPreWBApply = false; + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + } else { + // FIXME: Solve below issue. + // Not dirty but could be active still. 
+ // need to activate tracking + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + } + cacheBlocks[block_index].lastChangedTick = curTick(); + } else { + + } if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); @@ -883,77 +751,85 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) __func__, block_index, cacheBlocks[block_index].to_string()); // A cache block should not be touched while it's waiting for data. // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - + // TODO: Figure out if this is still necessary. if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { return; } - assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(!cacheBlocks[block_index].needsPreWBApply); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) { PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + if (cacheBlocks[block_index].addr = wb_pkt->getAddr()) { wb_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - cacheBlocks[block_index].needsWB = true; - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == cacheBlocks[block_index].addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); 
- DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } + cacheBlocks[block_index].dirty = true; + need_send_pkt = false; postPushWBQueue.erase(wb); + } + } + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr = ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); need_send_pkt = false; + activeBuffer.erase(ab); } } + if (!need_send_pkt) { + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsPreWBApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + for (auto it = 
MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { + pendingVertexPullReads.end()) { need_send_pkt = false; } @@ -964,11 +840,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != - pendingVertexPullReads.end()) { - 
stats.numDoubleMemReads++; - } } } @@ -979,19 +850,27 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(cacheBlocks[block_index].pendingWB); - - // Why would we write it back if it does not have a conflict. - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(!cacheBlocks[block_index].needsPreWBApply); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + Addr base_addr = cacheBlocks[block_index].addr; + for (int index = 0; index < numElementsPerLine; index++) { + if (cacheBlocks[block_index].items[index].active) { + Addr vertex_addr = base_addr + index * sizeof(WorkListItem); + // NOTE: Implement this + // workdir.activate() + // cacheWorkCount--; + } + } + if (activeCacheBlocks.find(block_index)) { + activeCacheBlocks.erase(block_index); + } PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -999,30 +878,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); - // onTheFlyReqs++; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].pendingWB = false; - - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - 
DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); + cacheBlocks[block_index].reset(); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, @@ -1049,55 +905,6 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) } } -std::tuple -CoalesceEngine::getOptimalPullAddr() -{ - int visited_bits = 0; - int num_intial_active_bits = activeBits.size(); - while (visited_bits < num_intial_active_bits) { - int index = activeBits.front(); - int base_index = roundDown(index, numElementsPerLine); - int index_offset = index - base_index; - assert(needsPush[index] == 1); - assert(index_offset < numElementsPerLine); - - Addr addr = getBlockAddrFromBitIndex(base_index); - int block_index = getBlockIndex(addr); - if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) - { - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::PENDING_READ, addr, index_offset); - } else { - // Only if it is in cache and it is in idle state. 
- if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_CACHE, block_index, index_offset); - // Otherwise if it is in memory - } else if ((cacheBlocks[block_index].addr != addr)) { - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_MEMORY, addr, index_offset); - } - } - activeBits.pop_front(); - activeBits.push_back(index); - visited_bits++; - } - - return std::make_tuple(WorkLocation::GARBAGE, 0, 0); -} - void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { @@ -1262,8 +1069,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(vertexPushBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), "Histogram of the length of the bitvector."), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 05e268270a..8da67c7b43 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -60,9 +60,26 @@ enum CacheState IDLE, PENDING_PRE_WB_APPLY, PENDING_WB, + PENDING_PRE_PUSH_APPLY, NUM_CACHE_STATE }; +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_PRE_WB_APPLY", + "PENDING_WB", + "PENDING_PRE_PUSH_APPLY" +}; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH +}; + class MPU; @@ -71,7 +88,6 @@ class MPU; class WorkDirectory { private: - CoalesceEngine* owner; Addr 
memoryAtomSize; int atomBlockSize; size_t elementSize; @@ -88,7 +104,6 @@ class WorkDirectory void deactivate(Addr addr); int workCount(); std::tuple getNextWork(); - }; class CoalesceEngine : public BaseMemoryEngine @@ -100,47 +115,54 @@ class CoalesceEngine : public BaseMemoryEngine Addr addr; uint64_t busyMask; bool valid; - bool needsApply; - bool needsWB; - bool pendingData; - bool pendingApply; - bool pendingWB; - Tick lastChangedTick; + bool dirty; + bool hasConflict; + bool needsPreWBApply; CacheState state; - // TODO: This might be useful in the future - // Tick lastWLWriteTick; + Tick lastChangedTick; Block() {} Block(int num_elements): addr(-1), busyMask(0), valid(false), - needsApply(false), - needsWB(false), - pendingData(false), - pendingApply(false), - pendingWB(false), - lastChangedTick(0), - state(CacheState::INVALID) + dirty(false), + hasConflict(false), + needsPreWBApply(false), + state(CacheState::INVALID), + lastChangedTick(0) { items = new WorkListItem [num_elements]; } + void reset() { + addr = -1; + busyMask = 0; + valid = false; + dirty = false; + hasConflict = false; + needsPreWBApply = false; + state = CacheState::INVALID; + lastChangedTick = 0; + } + std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", - addr, busyMask, valid ? "true" : "false", - needsApply ? "true" : "false", needsWB ? "true" : "false", - pendingData ? "true" : "false", pendingApply ? "true" : "false", - pendingWB ? "true" : "false", lastChangedTick); + "dirty: %s, hasConflict: %s, needsPreWBApply: %s" + "state: %s, lastChangedTick: %lu}", addr, busyMask, + valid ? "true" : "false", dirty ? "true" : "false", + hasConflict ? "true" : "false", + needsPreWBApply ? 
"true" : "false", + cacheStateStrings[state], lastChangedTick); } }; - struct SenderState : public Packet::SenderState + struct ReadPurpose : public Packet::SenderState { - bool isRetry; - SenderState(bool is_retry): isRetry(is_retry) {} + ReadDestination _dest; + ReadPurpose(ReadDestination dest): _dest(dest) {} + ReadDestination dest() { return _dest; } }; + MPU* owner; GraphWorkload* graphWorkload; @@ -150,28 +172,33 @@ class CoalesceEngine : public BaseMemoryEngine int onTheFlyReqs; int numMSHREntries; - int numTgtsPerMSHR; std::unordered_map> MSHR; + + // Response route to WLEngine int maxRespPerCycle; std::deque> responseQueue; - int _workCount; + // Tracking work in cache + int cacheWorkCount; int numPullsReceived; - UniqueFIFO applyQueue; - std::bitset needsPush; - std::deque activeBits; + UniqueFIFO preWBApplyQueue; + // NOTE: Remember to erase from this upon eviction from cache + UniqueFIFO activeCacheBlocks; + + int pendingPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; + + int activeBufferSize; int postPushWBQueueSize; + std::deque> activeBuffer; std::deque> postPushWBQueue; int getBlockIndex(Addr addr); + // TODO: Should be moved to WorkDirectory int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); - - int maxPotentialPostPushWB; - // A map from addr to sendMask. sendMask determines which bytes to - // send for push when getting the read response from memory. 
- std::unordered_map pendingVertexPullReads; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -188,6 +215,9 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextPreWBApplyEvent; void processNextPreWBApplyEvent(); + EventFunctionWrapper nextPrePushApplyEvent; + void processNextPrePushApplyEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); @@ -223,7 +253,6 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; @@ -246,6 +275,8 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); + // FIXME: Update this to return sum of cacheWorkCount and WorkDirectory + // workcount. int workCount() { return _workCount; } void recvVertexPull(); diff --git a/src/accl/graph/sega/coalesce_engine_bak.cc b/src/accl/graph/sega/coalesce_engine_bak.cc new file mode 100644 index 0000000000..7a064c1c2f --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_bak.cc @@ -0,0 +1,1308 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), + numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), + maxPotentialPostPushWB(0), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextPreWBApplyEvent([this] { + processNextPreWBApplyEvent(); + }, name() + ".nextPreWBApplyEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + needsPush.reset(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + // NOTE: No need to check needsWB because there might be entries + // that have been updated and not written back in the cache. 
+ // assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + // TODO: Add and implement init function for GraphWorkload. + int bit_index_base = getBitIndexBase(pkt->getAddr()); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); + memPort.sendFunctional(pkt); + } +} + +bool +CoalesceEngine::done() +{ + return applyQueue.empty() && needsPush.none() && + memoryFunctionQueue.empty() && (onTheFlyReqs == 0); +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBitIndexBase(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + return atom_index * block_bits; +} + +// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) +Addr +CoalesceEngine::getBlockAddrFromBitIndex(int index) +{ + assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); + Addr trimmed_addr = index * sizeof(WorkListItem); + return peerMemoryRange.addIntlvBits(trimmed_addr); +} + +bool +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < 
numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].valid)) { + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(!cacheBlocks[block_index].pendingData); + // No cache block could be in pendingApply and pendingWB at the + // same time. + assert(!(cacheBlocks[block_index].pendingApply && + cacheBlocks[block_index].pendingWB)); + // Hit + // TODO: Add a hit latency as a param for this object. + // Can't just schedule the nextResponseEvent for latency cycles in + // the future. + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // If they are scheduled for apply and WB those schedules should be + // discarded. Since there is no easy way to take items out of the + // function queue. Those functions check for their respective bits + // and skip the process if the respective bit is set to false. 
+ cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return true; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].pendingData)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); + stats.mshrTargetShortage++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " + "cacheBlocks[%d].\n", __func__, block_index); + } + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return true; + } else { + // miss + // FIXME: Make this assert work. It will break if the cache block + // is cold and addr or aligned_addr is 0. It fails because cache block + // addr field is initialized to 0. Unfortunately Addr type is unsigned. + // So you can not initialized addr to -1. + assert(cacheBlocks[block_index].addr != aligned_addr); + assert(MSHR.size() <= numMSHREntries); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); + if (MSHR.find(block_index) == MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" + " %lu not found in MSHRs.\n", __func__, block_index, addr); + if (MSHR.size() == numMSHREntries) { + // Out of MSHR entries + DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" + "Rejecting request.\n", __func__); + // TODO: Break out read rejections into more than one stat + // based on the cause of the rejection + stats.mshrEntryShortage++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR " + "entries available.\n", __func__); + if ((cacheBlocks[block_index].valid) || + (cacheBlocks[block_index].pendingData)) { + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state.\n", __func__, block_index); + // We're in idle state + // Idle: valid && !pendingApply && !pendingWB; + // Note 0: needsApply has to be false. Because + // A cache line enters the idle state from two + // other states. First a busy state that does not + // need apply (needsApply is already false) or + // from pendingApplyState after being applied which + // clears the needsApply bit. needsApply is useful + // when a cache block has transitioned from + // pendingApply to busy without the apply happening. + // Note 1: pendingData does not have to be evaluated + // becuase pendingData is cleared when data + // arrives from the memory and valid does not + // denote cleanliness of the line. Rather it + // is used to differentiate between empty blocks + // and the blocks that have data from memory. + // pendingData denotes the transient state between + // getting a miss and getting the data for that miss. + // valid basically means that the data in the cache + // could be used to respond to read/write requests. + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + // There are no conflicts in idle state. 
+ assert(MSHR.find(block_index) == MSHR.end()); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" + "to be written back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextWriteBack for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " + "not need to be written back.\n", + __func__, block_index); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextRead for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } + } + // cacheBlocks[block_index].hasConflict = true; + 
MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + stats.readMisses++; + // TODO: Add readConflicts here. + stats.numVertexReads++; + return true; + } else { + // MSHR available and no conflict + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " + "Allocating a cache line for it.\n" + , __func__, addr); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR[block_index].size() == 0); + + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + " Addr: %lu.\n", __func__, block_index, addr); + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " + "input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + 
} + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + stats.readMisses++; + stats.numVertexReads++; + return true; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " + "Addr: %lu already in MSHRs. It has a conflict " + "with addr: %lu.\n", __func__, block_index, addr, + cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() > 0); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. Rejecting request.\n", + __func__, block_index); + stats.mshrTargetShortage++; + return false; + } + DPRINTF(CoalesceEngine, "%s: There is room for another target " + "for cacheBlocks[%d].\n", __func__, block_index); + + // TODO: Might want to differentiate between different misses. + stats.readMisses++; + + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " + "cacheBlocks[%d].\n", __func__, addr, block_index); + stats.numVertexReads++; + return true; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + return true; + } + + onTheFlyReqs--; + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + WorkListItem* items = pkt->getPtr(); + + bool do_wb = false; + if (pkt->findNextSenderState()) { + assert(!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid))); + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. 
+ + DPRINTF(CoalesceEngine, "%s: Received read response for pull read " + "for addr %lu.\n", __func__, addr); + int it = getBitIndexBase(addr); + uint64_t send_mask = pendingVertexPullReads[addr]; + // No applying of the line needed. + for (int i = 0; i < numElementsPerLine; i++) { + Addr vertex_addr = addr + i * sizeof(WorkListItem); + uint64_t vertex_send_mask = send_mask & (1 << i); + if (vertex_send_mask != 0) { + assert(needsPush[it + i] == 1); + needsPush[it + i] = 0; + _workCount--; + + uint32_t delta; + bool do_push, do_wb_v; + std::tie(delta, do_push, do_wb_v) = + graphWorkload->prePushApply(items[i]); + do_wb |= do_wb_v; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + items[i].edgeIndex, items[i].degree); + } else { + // TODO: Add a stat to count this. + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pendingVertexPullReads.erase(addr); + maxPotentialPostPushWB--; + } + + bool cache_wb = false; + if (cacheBlocks[block_index].addr == addr) { + DPRINTF(CoalesceEngine, "%s: Received read response to " + "fill cacheBlocks[%d].\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR.find(block_index) != MSHR.end()); + std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); + for (int i = 0; i < numElementsPerLine; i++) { + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[i])); + } + 
cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsWB |= do_wb; + cacheBlocks[block_index].pendingData = false; + // HACK: In case processNextRead is called on the same tick as curTick + // and is scheduled to read to the same cacheBlocks[block_index] + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + cache_wb = true; + } else if (do_wb) { + PacketPtr wb_pkt = createWritePacket( + addr, peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + // TODO: Add a stat to count this. + // FIXME: This is not a totally wasteful read. e.g. all reads + // for pull in BFS are like this. + DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); + } + + if (cache_wb) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + } + + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + + + // TODO: Probably check for done here too. + delete pkt; + return true; +} + +// TODO: For loop to empty the entire responseQueue. +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + // Desing does not allow for write misses for now. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask != 0); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + // respective bit in busyMask for wl is set. 
+ assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].needsWB |= true; + stats.numVertexWrites++; + } + cacheBlocks[block_index].items[wl_offset] = wl; + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { + cacheBlocks[block_index].needsApply |= true; + cacheBlocks[block_index].needsWB |= true; + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + // TODO: Make this more general and programmable. + if ((cacheBlocks[block_index].busyMask == 0)) { + if (cacheBlocks[block_index].needsApply) { + cacheBlocks[block_index].pendingApply = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " + "applyQueue.\n", __func__, block_index); + if ((!applyQueue.empty()) && + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); + } + } else { + assert(MSHR.size() <= numMSHREntries); + // cache line has conflict. 
+ if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflict.\n", __func__, block_index); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" + " back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" + " a write back.\n", __func__, block_index); + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + 
(!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + +} + +void +CoalesceEngine::processNextPreWBApplyEvent() +{ + int block_index = applyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + "cacheBlock[%d] to be applied.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingWB); + + if (cacheBlocks[block_index].pendingApply) { + assert(cacheBlocks[block_index].busyMask == 0); + for (int index = 0; index < numElementsPerLine; index++) { + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + if (do_push) { + int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); + if (needsPush[bit_index_base + index] == 0) { + needsPush[bit_index_base + index] = 1; + _workCount++; + activeBits.push_back(bit_index_base + index); + if (!owner->running()) { + owner->start(); + } + } + } + } + stats.bitvectorLength.sample(needsPush.count()); + + assert(cacheBlocks[block_index].needsWB); + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + + assert(MSHR.size() <= numMSHREntries); + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflicts.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + 
[this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + stats.numInvalidApplies++; + } + + applyQueue.pop_front(); + if ((!applyQueue.empty()) && + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); + } + + if (done()) { + owner->recvDoneSignal(); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + bool need_send_pkt = true; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].needsWB = true; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == cacheBlocks[block_index].addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + 
responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + postPushWBQueue.erase(wb); + need_send_pkt = false; + } + } + + if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != + pendingVertexPullReads.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(cacheBlocks[block_index].pendingWB); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + // onTheFlyReqs++; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].pendingWB = false; + + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + stats.numInvalidWriteBacks++; + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + memPort.sendPacket(wb_pkt); + postPushWBQueue.pop_front(); + } +} + +std::tuple +CoalesceEngine::getOptimalPullAddr() +{ + int visited_bits = 0; + int num_intial_active_bits = activeBits.size(); + while (visited_bits < num_intial_active_bits) { + int index = activeBits.front(); + int base_index = roundDown(index, numElementsPerLine); + int index_offset = index - base_index; + assert(needsPush[index] == 1); + assert(index_offset < numElementsPerLine); + + Addr addr = getBlockAddrFromBitIndex(base_index); + int block_index = getBlockIndex(addr); + if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) + { + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::PENDING_READ, addr, index_offset); + } else { + // Only if it is in cache and it is in idle state. 
+ if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::IN_CACHE, block_index, index_offset); + // Otherwise if it is in memory + } else if ((cacheBlocks[block_index].addr != addr)) { + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::IN_MEMORY, addr, index_offset); + } + } + activeBits.pop_front(); + activeBits.push_back(index); + visited_bits++; + } + + return std::make_tuple(WorkLocation::GARBAGE, 0, 0); +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + WorkLocation bit_status; + Addr location; + int offset; + + std::tie(bit_status, location, offset) = getOptimalPullAddr(); + + if (bit_status != WorkLocation::GARBAGE) { + if (bit_status == WorkLocation::PENDING_READ) { + // renaming the outputs to thier local names. + Addr addr = location; + int index_offset = offset; + + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } + if (bit_status == WorkLocation::IN_CACHE) { + // renaming the outputs to their local names. 
+ int block_index = (int) location; + int wl_offset = offset; + + Addr addr = cacheBlocks[block_index].addr; + Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); + int slice_base_index = getBitIndexBase(addr); + + needsPush[slice_base_index + wl_offset] = 0; + _workCount--; + + uint32_t delta; + bool do_push, do_wb; + std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].needsWB |= do_wb; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + cacheBlocks[block_index].items[wl_offset].edgeIndex, + cacheBlocks[block_index].items[wl_offset].degree); + } else { + DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + numPullsReceived--; + } + if (bit_status == WorkLocation::IN_MEMORY) { + if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + onTheFlyReqs++; + maxPotentialPostPushWB++; + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } + } + } + + stats.bitvectorSearchStatus[bit_status]++; + + if (numPullsReceived > 0) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " + "0 to memoryFunctionQueue.\n", __func__); + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + 
DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +void +CoalesceEngine::recvVertexPull() +{ + bool should_schedule = (numPullsReceived == 0); + numPullsReceived++; + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (should_schedule) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), + "Number of cache rejections caused by entry shortage."), + ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), + "Number of cache rejections caused by target shortage."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. "), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), + "Number of times a memory block has been read twice. 
" + "Once for push and once to populate the cache."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(numInvalidApplies, statistics::units::Count::get(), + "Number of times a line has become busy" + " while waiting to be applied."), + ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), + "Number of times a scheduled memory function has been invalid."), + ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), + "Distribution for the location of vertex searches."), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(mshrEntryLength, statistics::units::Count::get(), + "Histogram on the length of the mshr entries."), + ADD_STAT(bitvectorLength, statistics::units::Count::get(), + "Histogram of the length of the bitvector."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + bitvectorSearchStatus.init(NUM_STATUS); + bitvectorSearchStatus.subname(0, "PENDING_READ"); + bitvectorSearchStatus.subname(1, "IN_CACHE"); + bitvectorSearchStatus.subname(2, "IN_MEMORY"); + bitvectorSearchStatus.subname(3, "GARBAGE"); + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine_bak.hh b/src/accl/graph/sega/coalesce_engine_bak.hh new file mode 100644 index 0000000000..0787a334c1 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_bak.hh @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "base/cprintf.hh" +#include "base/statistics.hh" +#include "params/CoalesceEngine.hh" + + + +namespace gem5 +{ + +enum WorkLocation +{ + PENDING_READ, + IN_CACHE, + IN_MEMORY, + GARBAGE, + NUM_STATUS +}; + +class MPU; + +class CoalesceEngine : public BaseMemoryEngine +{ + private: + struct Block + { + WorkListItem* items; + Addr addr; + uint64_t busyMask; + bool valid; + bool needsApply; + bool needsWB; + bool pendingData; + bool pendingApply; + bool pendingWB; + Tick lastChangedTick; + // TODO: This might be useful in the future + // Tick lastWLWriteTick; + Block() {} + Block(int num_elements): + addr(-1), + busyMask(0), + valid(false), + needsApply(false), + needsWB(false), + pendingData(false), + pendingApply(false), + pendingWB(false), + lastChangedTick(0), + { + items = new WorkListItem [num_elements]; + } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "needsApply: %s, needsWB: %s, pendingData: %s, " + "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + needsApply ? "true" : "false", needsWB ? "true" : "false", + pendingData ? "true" : "false", pendingApply ? "true" : "false", + pendingWB ? 
"true" : "false", lastChangedTick); + } + }; + + struct SenderState : public Packet::SenderState + { + bool isRetry; + SenderState(bool is_retry): isRetry(is_retry) {} + }; + MPU* owner; + GraphWorkload* graphWorkload; + + int numLines; + int numElementsPerLine; + Block* cacheBlocks; + + int onTheFlyReqs; + int numMSHREntries; + int numTgtsPerMSHR; + std::unordered_map> MSHR; + int maxRespPerCycle; + std::deque> responseQueue; + + int _workCount; + int numPullsReceived; + UniqueFIFO applyQueue; + std::bitset needsPush; + std::deque activeBits; + int postPushWBQueueSize; + std::deque> postPushWBQueue; + + int getBlockIndex(Addr addr); + int getBitIndexBase(Addr addr); + Addr getBlockAddrFromBitIndex(int index); + std::tuple getOptimalPullAddr(); + + int maxPotentialPostPushWB; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; + + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; + + EventFunctionWrapper nextResponseEvent; + void processNextResponseEvent(); + + EventFunctionWrapper nextPreWBApplyEvent; + void processNextPreWBApplyEvent(); + + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + virtual void regStats() override; + + virtual void resetStats() override; + + CoalesceEngine &coalesce; + + Tick lastResetTick; + + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; + 
statistics::Scalar responsePortShortage; + statistics::Scalar numMemoryBlocks; + statistics::Scalar numDoubleMemReads; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + statistics::Scalar numInvalidApplies; + statistics::Scalar numInvalidWriteBacks; + + statistics::Vector bitvectorSearchStatus; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; + statistics::Histogram responseQueueLatency; + statistics::Histogram memoryFunctionLatency; + }; + + CoalesceStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + PARAMS(CoalesceEngine); + CoalesceEngine(const Params ¶ms); + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt); + + bool recvWLRead(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount() { return _workCount; } + void recvVertexPull(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ From 80b3803f040e09cae9f083e39d637c6445aab247 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 Nov 2022 00:05:27 -0800 Subject: [PATCH 211/247] First working and tested version of workdirectory. 
--- configs/accl/bfs.py | 1 + configs/accl/sega.py | 6 +- src/accl/graph/base/data_structs.hh | 23 +- src/accl/graph/base/graph_workload.cc | 236 ++-- src/accl/graph/base/graph_workload.hh | 67 +- src/accl/graph/sega/CenteralController.py | 2 +- src/accl/graph/sega/CoalesceEngine.py | 7 +- src/accl/graph/sega/CoalesceEngine_bak.py | 50 - src/accl/graph/sega/SConscript | 5 +- src/accl/graph/sega/centeral_controller.cc | 14 +- src/accl/graph/sega/centeral_controller.hh | 3 +- src/accl/graph/sega/coalesce_engine.cc | 932 +++++++------- src/accl/graph/sega/coalesce_engine.hh | 117 +- src/accl/graph/sega/coalesce_engine_bak.cc | 1308 -------------------- src/accl/graph/sega/coalesce_engine_bak.hh | 218 ---- src/accl/graph/sega/enums.cc | 57 + src/accl/graph/sega/enums.hh | 66 + src/accl/graph/sega/mpu.cc | 6 - src/accl/graph/sega/mpu.hh | 6 +- src/accl/graph/sega/push_engine.cc | 37 +- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 28 +- src/accl/graph/sega/wl_engine.hh | 1 + src/accl/graph/sega/work_directory.hh | 212 ++++ src/mem/mem_ctrl.cc | 2 +- 25 files changed, 1030 insertions(+), 2376 deletions(-) delete mode 100644 src/accl/graph/sega/CoalesceEngine_bak.py delete mode 100644 src/accl/graph/sega/coalesce_engine_bak.cc delete mode 100644 src/accl/graph/sega/coalesce_engine_bak.hh create mode 100644 src/accl/graph/sega/enums.cc create mode 100644 src/accl/graph/sega/enums.hh create mode 100644 src/accl/graph/sega/work_directory.hh diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index fc32b96642..a201acd4d1 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -68,6 +68,7 @@ def get_inputs(): m5.instantiate() + system.create_pop_count_directory(256) system.create_bfs_workload(init_addr, init_value) exit_event = m5.simulate() print( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0f4b133791..54f22b1377 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -54,8 +54,8 @@ def __init__(self, 
edge_memory_size: str, cache_size: str): attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=64, - num_tgts_per_mshr=64, max_resp_per_cycle=8, + active_buffer_size = 64, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -139,6 +139,10 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def create_pop_count_directory(self, atoms_per_block): + for gpt in self.gpts: + gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 070e635736..84233ae39c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -36,8 +36,6 @@ #include #include -#define MAX_BITVECTOR_SIZE (1 << 28) - namespace gem5 { @@ -45,33 +43,28 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; + uint32_t degree : 32; uint32_t edgeIndex : 32; - uint32_t degree : 31; - bool active: 1; std::string to_string() { - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " - "degree: %u, active: %s}", tempProp, prop, edgeIndex, degree, - active ? 
"true" : "false"); + return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u}", tempProp, prop, edgeIndex, degree); } WorkListItem(): tempProp(0), prop(0), - edgeIndex(0), degree(0), - active(false) + edgeIndex(0) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t edge_index, uint32_t degree, bool active): + uint32_t degree, uint32_t edge_index): tempProp(temp_prop), prop(prop), - edgeIndex(edge_index), degree(degree), - active(active) + edgeIndex(edge_index) {} }; @@ -111,8 +104,8 @@ struct MetaEdge { std::string to_string() { - return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u}", - src, dst, weight); + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u, value: %u}", + src, dst, weight, value); } }; diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 07accff44f..446509201f 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -56,39 +56,27 @@ readFromFloat(float value) return float_bits; } -BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): - GraphWorkload(), initValue(init_value), atomSize(atom_size) -{ - initAddrBase = roundDown(init_addr, atomSize); - initIndex = (init_addr - initAddrBase) / atomSize; - numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -} - - void -BFSWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) +BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) { - if (pkt->getAddr() == initAddrBase) { - WorkListItem items[numElementsPerLine]; + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); - pkt->writeDataToBlock((uint8_t*) items, atomSize); + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; - items[initIndex].tempProp = initValue; - items[initIndex].prop = initValue; - if 
(items[initIndex].degree > 0) { - needsPush[bit_index_base + initIndex] = 1; - activeBits.push_back(bit_index_base + initIndex); - _workCount++; - } + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } pkt->deleteData(); pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, atomSize); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); } - } uint32_t @@ -104,28 +92,16 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) } bool -BFSWorkload::applyCondition(WorkListItem wl) +BFSWorkload::activeCondition(WorkListItem wl) { - return wl.tempProp < wl.prop; -} - -bool -BFSWorkload::preWBApply(WorkListItem& wl) -{ - if (applyCondition(wl)) { - wl.prop = wl.tempProp; - if (wl.degree > 0) { - return true; - } - } - return false; + return (wl.tempProp < wl.prop) && (wl.degree > 0); } -std::tuple -BFSWorkload::prePushApply(WorkListItem& wl) +uint32_t +BFSWorkload::apply(WorkListItem& wl) { - uint32_t value = wl.prop; - return std::make_tuple(value, true, false); + wl.prop = wl.tempProp; + return wl.prop; } std::string @@ -137,92 +113,92 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } -PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): - GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) -{ - numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -} - -void -PRWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) -{ - WorkListItem items[numElementsPerLine]; - - pkt->writeDataToBlock((uint8_t*) items, atomSize); - for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - alpha); - if (items[i].degree > 0) { - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); - 
_workCount++; - } - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, atomSize); -} - -uint32_t -PRWorkload::reduce(uint32_t update, uint32_t value) -{ - float update_float = writeToFloat(update); - float value_float = writeToFloat(value); - return readFromFloat(update_float + value_float); -} - -uint32_t -PRWorkload::propagate(uint32_t value, uint32_t weight) -{ - float value_float = writeToFloat(value); - float weight_float = 1.0; - - return readFromFloat(alpha * value_float * weight_float); -} - -bool -PRWorkload::applyCondition(WorkListItem wl) -{ - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float dist = std::abs(temp_float - prop_float); - return dist >= threshold; -} - -bool -PRWorkload::preWBApply(WorkListItem& wl) -{ - if (applyCondition(wl) && (wl.degree > 0)) { - return true; - } - return false; -} - -std::tuple -PRWorkload::prePushApply(WorkListItem& wl) -{ - if (applyCondition(wl)) { - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float delta = (temp_float - prop_float) / wl.degree; - uint32_t delta_uint = readFromFloat(delta); - wl.prop = wl.tempProp; - return std::make_tuple(delta_uint, true, true); - } - return std::make_tuple(0, false, false); -} - -std::string -PRWorkload::printWorkListItem(const WorkListItem wl) -{ - float temp_float = writeToFloat(wl.tempProp); - return csprintf( - "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", - temp_float, temp_float, wl.degree, wl.edgeIndex - ); -} +// PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): +// GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) +// { +// numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); +// } + +// void +// PRWorkload::init(PacketPtr pkt, int bit_index_base, +// std::bitset& needsPush, +// std::deque& activeBits, +// int& _workCount) +// { +// WorkListItem items[numElementsPerLine]; + 
+// pkt->writeDataToBlock((uint8_t*) items, atomSize); +// for (int i = 0; i < numElementsPerLine; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - alpha); +// if (items[i].degree > 0) { +// needsPush[bit_index_base + i] = 1; +// activeBits.push_back(bit_index_base + i); +// _workCount++; +// } +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, atomSize); +// } + +// uint32_t +// PRWorkload::reduce(uint32_t update, uint32_t value) +// { +// float update_float = writeToFloat(update); +// float value_float = writeToFloat(value); +// return readFromFloat(update_float + value_float); +// } + +// uint32_t +// PRWorkload::propagate(uint32_t value, uint32_t weight) +// { +// float value_float = writeToFloat(value); +// float weight_float = 1.0; + +// return readFromFloat(alpha * value_float * weight_float); +// } + +// bool +// PRWorkload::applyCondition(WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float dist = std::abs(temp_float - prop_float); +// return dist >= threshold; +// } + +// bool +// PRWorkload::preWBApply(WorkListItem& wl) +// { +// if (applyCondition(wl) && (wl.degree > 0)) { +// return true; +// } +// return false; +// } + +// std::tuple +// PRWorkload::apply(WorkListItem& wl) +// { +// if (applyCondition(wl)) { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float delta = (temp_float - prop_float) / wl.degree; +// uint32_t delta_uint = readFromFloat(delta); +// wl.prop = wl.tempProp; +// return std::make_tuple(delta_uint, true, true); +// } +// return std::make_tuple(0, false, false); +// } + +// std::string +// PRWorkload::printWorkListItem(const WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// return csprintf( +// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", +// temp_float, temp_float, wl.degree, 
wl.edgeIndex +// ); +// } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 6bbc4935c2..f71955bd16 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -34,6 +34,7 @@ #include #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/work_directory.hh" #include "mem/packet.hh" @@ -46,70 +47,54 @@ class GraphWorkload GraphWorkload() {} ~GraphWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) = 0; + virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; - virtual bool applyCondition(WorkListItem wl) = 0; - virtual bool preWBApply(WorkListItem& wl) = 0; - virtual std::tuple prePushApply(WorkListItem& wl) = 0; + virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool activeCondition(WorkListItem wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; class BFSWorkload : public GraphWorkload { private: - uint64_t initAddrBase; - int initIndex; + uint64_t initAddr; uint32_t initValue; - int numElementsPerLine; - int atomSize; public: - BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size); + BFSWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} ~BFSWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount); + virtual void init(PacketPtr pkt, WorkDirectory* dir); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual bool applyCondition(WorkListItem wl); - virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); + virtual uint32_t apply(WorkListItem& wl); 
+ virtual bool activeCondition(WorkListItem wl); virtual std::string printWorkListItem(const WorkListItem wl); }; -class PRWorkload : public GraphWorkload -{ - private: - float alpha; - float threshold; +// class PRWorkload : public GraphWorkload +// { +// private: +// float alpha; +// float threshold; - int numElementsPerLine; - int atomSize; - - public: - PRWorkload(float alpha, float threshold, int atom_size); +// public: +// PRWorkload(float alpha, float threshold); - ~PRWorkload() {} +// ~PRWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual bool applyCondition(WorkListItem wl); - virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 09a997696d..0c21833a05 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createBFSWorkload"), - PyBindMethod("createPRWorkload"), + # PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 8ec9214b49..a447dedc3d 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ 
b/src/accl/graph/sega/CoalesceEngine.py @@ -27,6 +27,7 @@ from m5.params import * from m5.proxy import * +from m5.util.pybind import PyBindMethod from m5.objects.BaseMemoryEngine import BaseMemoryEngine class CoalesceEngine(BaseMemoryEngine): @@ -40,9 +41,13 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") - + active_buffer_size = Param.Int("Maximum number of memory active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. Probably, they should be equal.") post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") + cxx_exports = [PyBindMethod("createPopCountDirectory")] diff --git a/src/accl/graph/sega/CoalesceEngine_bak.py b/src/accl/graph/sega/CoalesceEngine_bak.py deleted file mode 100644 index 1fd3b968c5..0000000000 --- a/src/accl/graph/sega/CoalesceEngine_bak.py +++ /dev/null @@ -1,50 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseMemoryEngine import BaseMemoryEngine - -class CoalesceEngine(BaseMemoryEngine): - type = 'CoalesceEngine' - cxx_header = "accl/graph/sega/coalesce_engine.hh" - cxx_class = 'gem5::CoalesceEngine' - - cache_size = Param.MemorySize("Size of the internal SRAM array.") - - num_mshr_entry = Param.Int("Number of MSHR entries.") - - num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") - - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " - "requestor in each cycle. 
Used to limit b/w.") - - post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " - "apply process for applications that require " - "the apply process to happen exactly before " - "pushing the edgePointer to the PushEngine.") - diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 5d411be9ac..b3e1a838fb 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -37,6 +37,7 @@ SimObject("WLEngine.py", sim_objects=["WLEngine"]) Source("base_memory_engine.cc") Source("centeral_controller.cc") Source("coalesce_engine.cc") +Source("enums.cc") Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") @@ -45,10 +46,10 @@ DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") DebugFlag("CacheBlockState") DebugFlag("CoalesceEngine") -DebugFlag("FinalAnswer") DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") DebugFlag("WLEngine") CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", - "WLEngine", "BaseMemoryEngine"]) \ No newline at end of file + "WLEngine", "BaseMemoryEngine"]) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fc2262e111..883992e64e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,6 +82,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { + mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount()> 0)) { mpu->start(); } @@ -106,14 +107,14 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { - workload = new BFSWorkload(init_addr, init_value, system->cacheLineSize()); + workload = new BFSWorkload(init_addr, init_value); } -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold, 
system->cacheLineSize()); -} +// void +// CenteralController::createPRWorkload(float alpha, float threshold) +// { +// workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); +// } void CenteralController::recvDoneSignal() @@ -144,6 +145,7 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { + workload->apply(items[i]); std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 9ddb1b35f0..6eb07dbcac 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -35,7 +35,6 @@ #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" -#include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -64,7 +63,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createPRWorkload(float alpha, float threshold); + // void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 66ff66c068..0aa61345f7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,6 +34,7 @@ #include "base/intmath.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" +#include "debug/MSDebug.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -42,26 +43,23 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), + BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / 
peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - maxRespPerCycle(params.max_resp_per_cycle), cacheWorkCount(0), - numPullsReceived(0), activeBufferSize(params.post_push_wb_queue_size), + maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), pendingPullReads(0), + activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), - pendingPullReads(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), nextResponseEvent([this] { processNextResponseEvent(); }, name() + ".nextResponseEvent"), - nextPreWBApplyEvent([this] { - processNextPreWBApplyEvent(); - }, name() + ".nextPreWBApplyEvent"), - nextPrePushApplyEvent([this] { - processNextPrePushApplyEvent(); - }, name() + ".nextPrePushApplyEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -69,6 +67,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + activeBuffer.clear(); + postPushWBQueue.clear(); } void @@ -85,7 +85,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); - // TODO: Check postPushWBQueue for hits + // FIXME: Check postPushWBQueue for hits if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -97,54 +97,70 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - int bit_index_base = getBitIndexBase(pkt->getAddr()); - // FIXME: Pass workdirectory to graphworkload.init - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); + graphWorkload->init(pkt, directory); + if (pkt->getAddr() > 
lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } memPort.sendFunctional(pkt); } } +void +CoalesceEngine::postMemInitSetup() +{ + directory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::createPopCountDirectory(int atoms_per_block) +{ + directory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + bool CoalesceEngine::done() { - // FIXME: Fix this later - return applyQueue.empty() && needsPush.none() && - memoryFunctionQueue.empty() && (onTheFlyReqs == 0); + return memoryFunctionQueue.empty() && activeCacheBlocks.empty() && + activeBuffer.empty() && directory->empty() && (onTheFlyReqs == 0); } -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBlockIndex(Addr addr) +bool +CoalesceEngine::timeToPull() { - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; + return (activeBuffer.size() + pendingPullReads) < activeBufferSize; } -// FIXME: This and the next function should be moved to the -// WorkDirectory. 
-// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBitIndexBase(Addr addr) +bool +CoalesceEngine::canSchedulePull() { - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - return atom_index * block_bits; + // TODO: Maybe a good idea to change this to + // activeBuffer.size() + pendingPullReads + pullsScheduled < activeBufferSize + return pullsScheduled < 1; } -// FIXME: Read FIXME: Above -// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) -Addr -CoalesceEngine::getBlockAddrFromBitIndex(int index) +bool +CoalesceEngine::workLeftInMem() { - assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr trimmed_addr = index * sizeof(WorkListItem); - return peerMemoryRange.addIntlvBits(trimmed_addr); + return !directory->empty(); } bool +CoalesceEngine::pullCondition() +{ + return ((activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize); +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +ReadReturnStatus CoalesceEngine::recvWLRead(Addr addr) { Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); @@ -163,6 +179,9 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + if (cacheBlocks[block_index].state == CacheState::LOCKED_FOR_APPLY) { + return ReadReturnStatus::REJECT_NO_ROLL; + } DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; assert(cacheBlocks[block_index].state != CacheState::INVALID); @@ -197,7 +216,7 @@ CoalesceEngine::recvWLRead(Addr addr) 
schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; - return true; + return ReadReturnStatus::ACCEPT; } else if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { // Hit under miss @@ -207,7 +226,6 @@ CoalesceEngine::recvWLRead(Addr addr) assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].dirty); - assert(!cacheBlocks[block_index].needsPreWBApply); assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); @@ -217,7 +235,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); stats.numVertexReads++; - return true; + return ReadReturnStatus::ACCEPT; } else { // miss assert(cacheBlocks[block_index].addr != aligned_addr); @@ -232,20 +250,37 @@ CoalesceEngine::recvWLRead(Addr addr) if (cacheBlocks[block_index].state == CacheState::IDLE) { if (cacheBlocks[block_index].dirty) { cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } else { - // NOTE: move the cache block to invalid state - // FIXME: Fix the issue below. - // May need to activate tracking for this + // NOTE: The cache block could still be active but + // not dirty. If active we only have to active tracking + // but can throw the data away. 
+ bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); + } + // NOTE: Bring the cache line to invalid state. + // NOTE: Above line where we set hasConflict to true + // does not matter anymore since we reset the cache line. cacheBlocks[block_index].reset(); } + return ReadReturnStatus::REJECT_NO_ROLL; + } else { + return ReadReturnStatus::REJECT_ROLL; } - // return int instead of bool to tell WLEngine to whether - // roll the first entry in the queue. - return false; } else { // cold miss assert(MSHR.find(block_index) == MSHR.end()); @@ -255,16 +290,21 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].valid = false; cacheBlocks[block_index].dirty = false; cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].needsPreWBApply = false; cacheBlocks[block_index].state = CacheState::PENDING_DATA; cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextRead(block_index, schedule_tick); }, block_index, curTick()); - return true; + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; } else { - return false; + return ReadReturnStatus::REJECT_ROLL; } } } @@ -276,116 +316,87 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(pkt->isResponse()); DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", __func__, pkt->print()); + + onTheFlyReqs--; if (pkt->isWrite()) { DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); delete pkt; - return true; - } - - onTheFlyReqs--; - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - 
WorkListItem* items = pkt->getPtr(); - - bool do_wb = false; - if (pkt->findNextSenderState()) { - assert(!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid))); - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. - - DPRINTF(CoalesceEngine, "%s: Received read response for pull read " - "for addr %lu.\n", __func__, addr); - int it = getBitIndexBase(addr); - uint64_t send_mask = pendingVertexPullReads[addr]; - // No applying of the line needed. - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - uint64_t vertex_send_mask = send_mask & (1 << i); - if (vertex_send_mask != 0) { - assert(needsPush[it + i] == 1); - needsPush[it + i] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb_v; - std::tie(delta, do_push, do_wb_v) = - graphWorkload->prePushApply(items[i]); - do_wb |= do_wb_v; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - items[i].edgeIndex, items[i].degree); - } else { - // TODO: Add a stat to count this. - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - } + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); } - pendingVertexPullReads.erase(addr); - maxPotentialPostPushWB--; - } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. 
+ // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but it was allocated + // for in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will be not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before pkt arriveed. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. 
+ if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } - bool cache_wb = false; - if (cacheBlocks[block_index].addr == addr) { - DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR.find(block_index) != MSHR.end()); - std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[i])); - } - cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsWB |= do_wb; - cacheBlocks[block_index].pendingData = false; - // HACK: In case processNextRead is called on the same tick as curTick - // and is scheduled to read to the same cacheBlocks[block_index] - cacheBlocks[block_index].lastChangedTick = - curTick() - (Tick) (clockPeriod() / 2); - cache_wb = true; - } else if (do_wb) { - PacketPtr wb_pkt = createWritePacket( - addr, peerMemoryAtomSize, (uint8_t*) items); - postPushWBQueue.emplace_back(wb_pkt, curTick()); - memoryFunctionQueue.emplace_back( - [this] (int ignore, Tick schedule_tick) { - processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); 
- } - } else { - // TODO: Add a stat to count this. - // FIXME: This is not a totally wasteful read. e.g. all reads - // for pull in BFS are like this. - DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); - } + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + directory->deactivate(addr); + activeCacheBlocks.push_back(block_index); + } - if (cache_wb) { - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); - if (aligned_miss_addr == addr) { + assert(aligned_miss_addr == cacheBlocks[block_index].addr); int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " "cacheBlocks[%d] can be serviced with the received " "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " @@ -400,32 +411,72 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) graphWorkload->printWorkListItem( cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); - // TODO: 
Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(items[index]); + } + if (atom_active) { + directory->deactivate(addr); + activeBuffer.emplace_back(pkt, curTick()); + DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. 
" + "activeBuffer.size: %d.\n", __func__, + pkt->print(), activeBuffer.size()); } else { - it++; + delete pkt; + } + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // pullsScheduled++; + // } + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; } } } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); + if (done()) { + owner->recvDoneSignal(); } - - delete pkt; return true; } -// TODO: For loop to empty the entire responseQueue. void CoalesceEngine::processNextResponseEvent() { @@ -450,8 +501,8 @@ CoalesceEngine::processNextResponseEvent() addr_response); responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " "responseQueue.size = %d.\n", __func__, @@ -491,27 +542,28 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " "with Addr: %lu.\n", __func__, graphWorkload->printWorkListItem(wl), addr); - // Desing does not allow for write misses for now. + + // NOTE: Design does not allow for write misses. 
assert(cacheBlocks[block_index].addr == aligned_addr); // cache state asserts - assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask != 0); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); // respective bit in busyMask for wl is set. assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { - cacheBlocks[block_index].needsWB |= true; - stats.numVertexWrites++; + cacheBlocks[block_index].dirty |= true; } cacheBlocks[block_index].items[wl_offset] = wl; - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { - cacheBlocks[block_index].needsApply |= true; - cacheBlocks[block_index].needsWB |= true; + if ((graphWorkload->activeCondition(cacheBlocks[block_index].items[wl_offset])) && + (!activeCacheBlocks.find(block_index))) { + activeCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); @@ -523,188 +575,40 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - // TODO: Make this more general and programmable. 
- if ((cacheBlocks[block_index].busyMask == 0)) { - if (cacheBlocks[block_index].needsApply) { - cacheBlocks[block_index].pendingApply = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " - "applyQueue.\n", __func__, block_index); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - } else { - assert(MSHR.size() <= numMSHREntries); - // cache line has conflict. - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflict.\n", __func__, block_index); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" - " back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" - " a write back.\n", __func__, block_index); - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - 
cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - } - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - -} - -void -CoalesceEngine::processNextPreWBApplyEvent() -{ - int block_index = preWBApplyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the preWBApplyQueue. " - "cacheBlock[%d] to be applied.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - - if (cacheBlocks[block_index].state == CacheState::PENDING_PRE_WB_APPLY) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsPreWBApply); - bool block_active = false; - for (int index = 0; index < numElementsPerLine; index++) { - bool active = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - block_active |= active; - if (active) { - // cacheWorkCount++; - // FUTUREME: When pulling from activeCacheBlocks, in case we - // face a block that is not in idle state, we basically pop - // that entry and push it to the back. We only delete entries - // in this buffer if pushed or evicted. 
- activeCacheBlocks.push_back(block_index); - } - } - if (block_active && !owner->running()) { - owner->start(); - } - - cacheBlocks[block_index].needsPreWBApply = false; + if (cacheBlocks[block_index].busyMask == 0) { if (cacheBlocks[block_index].hasConflict) { if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } else { - // FIXME: Solve below issue. - // Not dirty but could be active still. - // need to activate tracking + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); + } cacheBlocks[block_index].reset(); } } else { cacheBlocks[block_index].state = CacheState::IDLE; - } - cacheBlocks[block_index].lastChangedTick = curTick(); - } else { - - } - - if (cacheBlocks[block_index].pendingApply) { - assert(cacheBlocks[block_index].busyMask == 0); - for (int index = 0; index < numElementsPerLine; index++) { - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - if (do_push) { - int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (needsPush[bit_index_base + index] == 0) { - needsPush[bit_index_base + index] = 1; - _workCount++; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } - } - } - } - stats.bitvectorLength.sample(needsPush.count()); - - assert(cacheBlocks[block_index].needsWB); - cacheBlocks[block_index].needsApply = false; - 
cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - - assert(MSHR.size() <= numMSHREntries); - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflicts.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); } - DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - stats.numInvalidApplies++; - } - - applyQueue.pop_front(); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); } - - if (done()) { + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { owner->recvDoneSignal(); } } @@ -740,6 +644,10 @@ CoalesceEngine::processNextMemoryEvent() if ((!memoryFunctionQueue.empty())) { schedule(nextMemoryEvent, nextCycle()); } + + if (done()) { + owner->recvDoneSignal(); + } } void @@ -759,36 +667,68 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].valid); assert(!cacheBlocks[block_index].dirty); - assert(!cacheBlocks[block_index].needsPreWBApply); 
assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); bool need_send_pkt = true; // NOTE: Search postPushWBQueue - for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) { PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr = wb_pkt->getAddr()) { + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { wb_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + need_send_pkt = false; - postPushWBQueue.erase(wb); + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + DPRINTF(MSDebug, "%s: Found addr: %lu in postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + cacheBlocks[block_index].addr, postPushWBQueue.size()); + } else { + wb++; } } - for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { PacketPtr ab_pkt = std::get<0>(*ab); - if (cacheBlocks[block_index].addr = ab_pkt->getAddr()) { + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { ab_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + activeCacheBlocks.push_back(block_index); + need_send_pkt = false; - activeBuffer.erase(ab); + ab = activeBuffer.erase(ab); + delete ab_pkt; + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // pullsScheduled++; + // } + DPRINTF(MSDebug, "%s: Found addr: %lu in activeBuffer. 
" + "activeBuffer.size: %d.\n", __func__, + cacheBlocks[block_index].addr, activeBuffer.size()); + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; } } if (!need_send_pkt) { - cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsPreWBApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { Addr miss_addr = *it; Addr aligned_miss_addr = @@ -828,14 +768,16 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].state = CacheState::BUSY; } - if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { need_send_pkt = false; } if (need_send_pkt) { PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); @@ -852,25 +794,24 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) block_index, cacheBlocks[block_index].to_string()); if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { - assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].dirty); assert(cacheBlocks[block_index].hasConflict); - assert(!cacheBlocks[block_index].needsPreWBApply); assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); - Addr base_addr = cacheBlocks[block_index].addr; + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. + bool atom_active = false; for (int index = 0; index < numElementsPerLine; index++) { - if (cacheBlocks[block_index].items[index].active) { - Addr vertex_addr = base_addr + index * sizeof(WorkListItem); - // NOTE: Implement this - // workdir.activate() - // cacheWorkCount--; - } + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); } - if (activeCacheBlocks.find(block_index)) { + if (atom_active) { activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); } + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -878,9 +819,8 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + onTheFlyReqs++; cacheBlocks[block_index].reset(); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); } else { @@ 
-896,94 +836,54 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) void CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) { + if (postPushWBQueue.empty()) { + return; + } PacketPtr wb_pkt; Tick pkt_tick; std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); if (schedule_tick == pkt_tick) { memPort.sendPacket(wb_pkt); + onTheFlyReqs++; postPushWBQueue.pop_front(); + DPRINTF(MSDebug, "%s: Popped pkt: %s from postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + wb_pkt->print(), postPushWBQueue.size()); } } void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - WorkLocation bit_status; - Addr location; - int offset; - - std::tie(bit_status, location, offset) = getOptimalPullAddr(); - - if (bit_status != WorkLocation::GARBAGE) { - if (bit_status == WorkLocation::PENDING_READ) { - // renaming the outputs to thier local names. - Addr addr = location; - int index_offset = offset; - - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; + pullsScheduled--; + if (!directory->empty()) { + Addr addr = directory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); } - if (bit_status == WorkLocation::IN_CACHE) { - // renaming the outputs to their local names. 
- int block_index = (int) location; - int wl_offset = offset; - - Addr addr = cacheBlocks[block_index].addr; - Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); - int slice_base_index = getBitIndexBase(addr); - - needsPush[slice_base_index + wl_offset] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb; - std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].needsWB |= do_wb; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - cacheBlocks[block_index].items[wl_offset].edgeIndex, - cacheBlocks[block_index].items[wl_offset].degree); - } else { - DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - numPullsReceived--; + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); } - if (bit_status == WorkLocation::IN_MEMORY) { - if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - maxPotentialPostPushWB++; - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + 
memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); } } - - stats.bitvectorSearchStatus[bit_status]++; - - if (numPullsReceived > 0) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " - "0 to memoryFunctionQueue.\n", __func__); - } } void @@ -1000,26 +900,149 @@ CoalesceEngine::recvMemRetry() schedule(nextMemoryEvent, nextCycle()); } +int +CoalesceEngine::workCount() +{ + return activeCacheBlocks.size() + + directory->workCount() + activeBuffer.size(); +} + void CoalesceEngine::recvVertexPull() { - bool should_schedule = (numPullsReceived == 0); - numPullsReceived++; + pullsReceived++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. pullsReceived: %d.\n", __func__, pullsReceived); stats.verticesPulled++; stats.lastVertexPullTime = curTick() - stats.lastResetTick; - if (should_schedule) { + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (graphWorkload->activeCondition(items[index])) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + pullsReceived--; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active = false; + for (int index = 0; index < 
numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(items[index]); + } + // NOTE: If the atom is not active anymore. + if (!atom_active) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + DPRINTF(MSDebug, "%s: Empalced pkt: %s in postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + wb_pkt->print(), postPushWBQueue.size()); + activeBuffer.pop_front(); + DPRINTF(MSDebug, "%s: Popped pkt: %s from activeBuffer. " + "activeBuffer.size: %d.\n", __func__, + pkt->print(), activeBuffer.size()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!activeCacheBlocks.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = activeCacheBlocks.size(); + while (true) { + int block_index = activeCacheBlocks.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (graphWorkload->activeCondition(cacheBlocks[block_index].items[index])) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + pullsReceived--; + } + } + + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(cacheBlocks[block_index].items[index]); + } + // NOTE: If we have reached the last item in the cache block + if (!atom_active) { + 
activeCacheBlocks.erase(block_index); + } + break; + } + // NOTE: If the block with index at the front of activeCacheBlocks + // is not in IDLE state, then roll the that index to the back + activeCacheBlocks.pop_front(); + activeCacheBlocks.push_back(block_index); + // NOTE: If we have visited all the items initially in the FIFO. + num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find " + "work to apply.\n", __func__); + } + + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // pullsScheduled++; + // } + if (pullCondition()) { memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + pullsScheduled++; + } + + if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); } } + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), @@ -1036,16 +1059,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hit under misses."), ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), "Number of cache rejections caused by entry shortage."), - ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), - "Number of cache rejections caused by target shortage."), ADD_STAT(responsePortShortage, 
statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), - ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), - "Number of times a memory block has been read twice. " - "Once for push and once to populate the cache."), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), @@ -1054,13 +1072,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), - ADD_STAT(numInvalidApplies, statistics::units::Count::get(), - "Number of times a line has become busy" - " while waiting to be applied."), ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), "Number of times a scheduled memory function has been invalid."), - ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), - "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rate - #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/work_directory.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" - - namespace gem5 { -enum WorkLocation -{ - PENDING_READ, - IN_CACHE, - IN_MEMORY, - GARBAGE, - NUM_STATUS -}; - -enum CacheState -{ - INVALID, - PENDING_DATA, - BUSY, - IDLE, - PENDING_PRE_WB_APPLY, - PENDING_WB, - PENDING_PRE_PUSH_APPLY, - NUM_CACHE_STATE -}; - -const char* 
cacheStateStrings[NUM_CACHE_STATE] = { - "INVALID", - "PENDING_DATA", - "BUSY", - "IDLE", - "PENDING_PRE_WB_APPLY", - "PENDING_WB", - "PENDING_PRE_PUSH_APPLY" -}; - -enum ReadDestination -{ - READ_FOR_CACHE, - READ_FOR_PUSH -}; - class MPU; - -// TODO: Add active bit to WorkListItem class. Check active bit before activate -// Only activate if necessary and not active before. -class WorkDirectory -{ - private: - Addr memoryAtomSize; - int atomBlockSize; - size_t elementSize; - - int _workCount; - public: - AddrRange memoryRange; - WorkDirectory(Addr atom_size, int block_size, size_t element_size): - memoryAtomSize(atom_size), atomBlockSize(block_size), - elementSize(element_size), _workCount(0) - {} - - void activate(Addr addr); - void deactivate(Addr addr); - int workCount(); - std::tuple getNextWork(); -}; - class CoalesceEngine : public BaseMemoryEngine { private: @@ -117,7 +54,6 @@ class CoalesceEngine : public BaseMemoryEngine bool valid; bool dirty; bool hasConflict; - bool needsPreWBApply; CacheState state; Tick lastChangedTick; Block() {} @@ -127,7 +63,6 @@ class CoalesceEngine : public BaseMemoryEngine valid(false), dirty(false), hasConflict(false), - needsPreWBApply(false), state(CacheState::INVALID), lastChangedTick(0) { @@ -140,18 +75,15 @@ class CoalesceEngine : public BaseMemoryEngine valid = false; dirty = false; hasConflict = false; - needsPreWBApply = false; state = CacheState::INVALID; lastChangedTick = 0; } std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "dirty: %s, hasConflict: %s, needsPreWBApply: %s" - "state: %s, lastChangedTick: %lu}", addr, busyMask, - valid ? "true" : "false", dirty ? "true" : "false", - hasConflict ? "true" : "false", - needsPreWBApply ? "true" : "false", + "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + dirty ? "true" : "false", hasConflict ? 
"true" : "false", cacheStateStrings[state], lastChangedTick); } }; @@ -164,8 +96,11 @@ class CoalesceEngine : public BaseMemoryEngine }; MPU* owner; + WorkDirectory* directory; GraphWorkload* graphWorkload; + Addr lastAtomAddr; + int numLines; int numElementsPerLine; Block* cacheBlocks; @@ -179,26 +114,26 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> responseQueue; // Tracking work in cache - int cacheWorkCount; - int numPullsReceived; - UniqueFIFO preWBApplyQueue; + int pullsReceived; // NOTE: Remember to erase from this upon eviction from cache UniqueFIFO activeCacheBlocks; + int pullsScheduled; int pendingPullReads; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. - std::unordered_map pendingVertexPullReads; + std::unordered_set pendingPullAddrs; int activeBufferSize; int postPushWBQueueSize; std::deque> activeBuffer; std::deque> postPushWBQueue; + bool timeToPull(); + bool canSchedulePull(); + bool workLeftInMem(); + bool pullCondition(); int getBlockIndex(Addr addr); - // TODO: Should be moved to WorkDirectory - int getBitIndexBase(Addr addr); - Addr getBlockAddrFromBitIndex(int index); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -212,11 +147,8 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); - EventFunctionWrapper nextPreWBApplyEvent; - void processNextPreWBApplyEvent(); - - EventFunctionWrapper nextPrePushApplyEvent; - void processNextPrePushApplyEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); struct CoalesceStats : public statistics::Group { @@ -236,19 +168,14 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; - 
statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidApplies; statistics::Scalar numInvalidWriteBacks; - statistics::Vector bitvectorSearchStatus; - statistics::Formula hitRate; statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; @@ -272,12 +199,14 @@ class CoalesceEngine : public BaseMemoryEngine void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt); - bool recvWLRead(Addr addr); + void postMemInitSetup(); + + void createPopCountDirectory(int atoms_per_block); + + ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - // FIXME: Update this to return sum of cacheWorkCount and WorkDirectory - // workcount. - int workCount() { return _workCount; } + int workCount(); void recvVertexPull(); bool done(); diff --git a/src/accl/graph/sega/coalesce_engine_bak.cc b/src/accl/graph/sega/coalesce_engine_bak.cc deleted file mode 100644 index 7a064c1c2f..0000000000 --- a/src/accl/graph/sega/coalesce_engine_bak.cc +++ /dev/null @@ -1,1308 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/coalesce_engine.hh" - -#include - -#include "accl/graph/sega/mpu.hh" -#include "base/intmath.hh" -#include "debug/CacheBlockState.hh" -#include "debug/CoalesceEngine.hh" -#include "debug/SEGAStructureSize.hh" -#include "mem/packet_access.hh" -#include "sim/sim_exit.hh" - -namespace gem5 -{ - -CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), - numLines((int) (params.cache_size / peerMemoryAtomSize)), - numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), - numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), - maxPotentialPostPushWB(0), - nextMemoryEvent([this] { - processNextMemoryEvent(); - }, name() + ".nextMemoryEvent"), - nextResponseEvent([this] { - processNextResponseEvent(); - }, name() + ".nextResponseEvent"), - nextPreWBApplyEvent([this] { - processNextPreWBApplyEvent(); - }, name() + ".nextPreWBApplyEvent"), - stats(*this) -{ - assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); - cacheBlocks = new Block [numLines]; - for (int i = 0; i < numLines; i++) { - cacheBlocks[i] = Block(numElementsPerLine); - } - needsPush.reset(); -} - -void -CoalesceEngine::registerMPU(MPU* mpu) -{ - owner = mpu; -} - -void -CoalesceEngine::recvFunctional(PacketPtr pkt) -{ - if (pkt->isRead()) { - assert(pkt->getSize() == peerMemoryAtomSize); - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsApply); - // NOTE: No need to check needsWB because there might be entries - // that have been updated and not written back in the cache. 
- // assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - pkt->makeResponse(); - pkt->setDataFromBlock( - (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - } else { - memPort.sendFunctional(pkt); - } - } else { - // TODO: Add and implement init function for GraphWorkload. - int bit_index_base = getBitIndexBase(pkt->getAddr()); - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); - memPort.sendFunctional(pkt); - } -} - -bool -CoalesceEngine::done() -{ - return applyQueue.empty() && needsPush.none() && - memoryFunctionQueue.empty() && (onTheFlyReqs == 0); -} - -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBlockIndex(Addr addr) -{ - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; -} - -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBitIndexBase(Addr addr) -{ - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - return atom_index * block_bits; -} - -// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) -Addr -CoalesceEngine::getBlockAddrFromBitIndex(int index) -{ - assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr trimmed_addr = index * sizeof(WorkListItem); - return peerMemoryRange.addIntlvBits(trimmed_addr); -} - -bool -CoalesceEngine::recvWLRead(Addr addr) -{ - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - assert(aligned_addr % peerMemoryAtomSize == 0); - int block_index = getBlockIndex(aligned_addr); - assert(block_index < numLines); - int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - assert(wl_offset < 
numElementsPerLine); - DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " - "This request maps to cacheBlocks[%d], aligned_addr: " - "%lu, and wl_offset: %d.\n", __func__, addr, - block_index, aligned_addr, wl_offset); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].valid)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); - stats.readHits++; - assert(!cacheBlocks[block_index].pendingData); - // No cache block could be in pendingApply and pendingWB at the - // same time. - assert(!(cacheBlocks[block_index].pendingApply && - cacheBlocks[block_index].pendingWB)); - // Hit - // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextResponseEvent for latency cycles in - // the future. - responseQueue.push_back(std::make_tuple( - addr, cacheBlocks[block_index].items[wl_offset], curTick())); - - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // If they are scheduled for apply and WB those schedules should be - // discarded. Since there is no easy way to take items out of the - // function queue. Those functions check for their respective bits - // and skip the process if the respective bit is set to false. 
- cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - // HACK: If a read happens on the same cycle as another operation such - // as apply set lastChangedTick to half a cycle later so that operation - // scheduled by the original operation (apply in this example) are - // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" - cacheBlocks[block_index].lastChangedTick = - curTick() + (Tick) (clockPeriod() / 2); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - if (!nextResponseEvent.scheduled()) { - schedule(nextResponseEvent, nextCycle()); - } - stats.numVertexReads++; - return true; - } else if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].pendingData)) { - // Hit under miss - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", - __func__, addr); - stats.readHitUnderMisses++; - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " - "cacheBlocks[%d].\n", __func__, block_index); - } - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - stats.numVertexReads++; - return true; - } else { - // miss - // FIXME: Make this assert work. It will break if the cache block - // is cold and addr or aligned_addr is 0. It fails because cache block - // addr field is initialized to 0. Unfortunately Addr type is unsigned. - // So you can not initialized addr to -1. - assert(cacheBlocks[block_index].addr != aligned_addr); - assert(MSHR.size() <= numMSHREntries); - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" - " %lu not found in MSHRs.\n", __func__, block_index, addr); - if (MSHR.size() == numMSHREntries) { - // Out of MSHR entries - DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" - "Rejecting request.\n", __func__); - // TODO: Break out read rejections into more than one stat - // based on the cause of the rejection - stats.mshrEntryShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR " - "entries available.\n", __func__); - if ((cacheBlocks[block_index].valid) || - (cacheBlocks[block_index].pendingData)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state.\n", __func__, block_index); - // We're in idle state - // Idle: valid && !pendingApply && !pendingWB; - // Note 0: needsApply has to be false. Because - // A cache line enters the idle state from two - // other states. First a busy state that does not - // need apply (needsApply is already false) or - // from pendingApplyState after being applied which - // clears the needsApply bit. needsApply is useful - // when a cache block has transitioned from - // pendingApply to busy without the apply happening. - // Note 1: pendingData does not have to be evaluated - // becuase pendingData is cleared when data - // arrives from the memory and valid does not - // denote cleanliness of the line. Rather it - // is used to differentiate between empty blocks - // and the blocks that have data from memory. - // pendingData denotes the transient state between - // getting a miss and getting the data for that miss. - // valid basically means that the data in the cache - // could be used to respond to read/write requests. - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - // There are no conflicts in idle state. 
- assert(MSHR.find(block_index) == MSHR.end()); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" - "to be written back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextWriteBack for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " - "not need to be written back.\n", - __func__, block_index); - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextRead for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } - } - // cacheBlocks[block_index].hasConflict = true; - 
MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - stats.readMisses++; - // TODO: Add readConflicts here. - stats.numVertexReads++; - return true; - } else { - // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " - "Allocating a cache line for it.\n" - , __func__, addr); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR[block_index].size() == 0); - - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" - " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " - "input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - 
} - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, - cacheBlocks[block_index].to_string()); - stats.readMisses++; - stats.numVertexReads++; - return true; - } - } - } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs. It has a conflict " - "with addr: %lu.\n", __func__, block_index, addr, - cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - assert(MSHR[block_index].size() > 0); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } - DPRINTF(CoalesceEngine, "%s: There is room for another target " - "for cacheBlocks[%d].\n", __func__, block_index); - - // TODO: Might want to differentiate between different misses. - stats.readMisses++; - - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " - "cacheBlocks[%d].\n", __func__, addr, block_index); - stats.numVertexReads++; - return true; - } - } -} - -bool -CoalesceEngine::handleMemResp(PacketPtr pkt) -{ - assert(pkt->isResponse()); - DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", - __func__, pkt->print()); - if (pkt->isWrite()) { - DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); - delete pkt; - return true; - } - - onTheFlyReqs--; - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - WorkListItem* items = pkt->getPtr(); - - bool do_wb = false; - if (pkt->findNextSenderState()) { - assert(!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid))); - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. 
- - DPRINTF(CoalesceEngine, "%s: Received read response for pull read " - "for addr %lu.\n", __func__, addr); - int it = getBitIndexBase(addr); - uint64_t send_mask = pendingVertexPullReads[addr]; - // No applying of the line needed. - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - uint64_t vertex_send_mask = send_mask & (1 << i); - if (vertex_send_mask != 0) { - assert(needsPush[it + i] == 1); - needsPush[it + i] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb_v; - std::tie(delta, do_push, do_wb_v) = - graphWorkload->prePushApply(items[i]); - do_wb |= do_wb_v; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - items[i].edgeIndex, items[i].degree); - } else { - // TODO: Add a stat to count this. - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - } - } - pendingVertexPullReads.erase(addr); - maxPotentialPostPushWB--; - } - - bool cache_wb = false; - if (cacheBlocks[block_index].addr == addr) { - DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR.find(block_index) != MSHR.end()); - std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[i])); - } - 
cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsWB |= do_wb; - cacheBlocks[block_index].pendingData = false; - // HACK: In case processNextRead is called on the same tick as curTick - // and is scheduled to read to the same cacheBlocks[block_index] - cacheBlocks[block_index].lastChangedTick = - curTick() - (Tick) (clockPeriod() / 2); - cache_wb = true; - } else if (do_wb) { - PacketPtr wb_pkt = createWritePacket( - addr, peerMemoryAtomSize, (uint8_t*) items); - postPushWBQueue.emplace_back(wb_pkt, curTick()); - memoryFunctionQueue.emplace_back( - [this] (int ignore, Tick schedule_tick) { - processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - // TODO: Add a stat to count this. - // FIXME: This is not a totally wasteful read. e.g. all reads - // for pull in BFS are like this. - DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); - } - - if (cache_wb) { - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. 
responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - } - - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } - - - // TODO: Probably check for done here too. - delete pkt; - return true; -} - -// TODO: For loop to empty the entire responseQueue. -void -CoalesceEngine::processNextResponseEvent() -{ - int num_responses_sent = 0; - - Addr addr_response; - WorkListItem worklist_response; - Tick response_queueing_tick; - while(true) { - std::tie(addr_response, worklist_response, response_queueing_tick) = - responseQueue.front(); - Tick waiting_ticks = curTick() - response_queueing_tick; - if (ticksToCycles(waiting_ticks) < 1) { - break; - } - owner->handleIncomingWL(addr_response, worklist_response); - num_responses_sent++; - DPRINTF(CoalesceEngine, - "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, - graphWorkload->printWorkListItem(worklist_response), - addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - stats.responseQueueLatency.sample( - waiting_ticks * 1e9 / getClockFrequency()); - if (num_responses_sent >= maxRespPerCycle) { - if (!responseQueue.empty()) { - stats.responsePortShortage++; - } - break; - } - if (responseQueue.empty()) { - break; - } - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } -} - -void -CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) -{ - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - int block_index = getBlockIndex(aligned_addr); - int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " - "wl: %s. This request maps to cacheBlocks[%d], " - "aligned_addr: %lu, and wl_offset: %d.\n", - __func__, addr, graphWorkload->printWorkListItem(wl), - block_index, aligned_addr, wl_offset); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, - graphWorkload->printWorkListItem(wl), addr); - // Desing does not allow for write misses for now. - assert(cacheBlocks[block_index].addr == aligned_addr); - // cache state asserts - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask != 0); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - // respective bit in busyMask for wl is set. 
- assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == - (1 << wl_offset)); - - if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { - cacheBlocks[block_index].needsWB |= true; - stats.numVertexWrites++; - } - cacheBlocks[block_index].items[wl_offset] = wl; - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { - cacheBlocks[block_index].needsApply |= true; - cacheBlocks[block_index].needsWB |= true; - } - - cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, wl_offset, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) { - if (cacheBlocks[block_index].needsApply) { - cacheBlocks[block_index].pendingApply = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " - "applyQueue.\n", __func__, block_index); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - } else { - assert(MSHR.size() <= numMSHREntries); - // cache line has conflict. 
- if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflict.\n", __func__, block_index); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" - " back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" - " a write back.\n", __func__, block_index); - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - 
(!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - } - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - -} - -void -CoalesceEngine::processNextPreWBApplyEvent() -{ - int block_index = applyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " - "cacheBlock[%d] to be applied.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingWB); - - if (cacheBlocks[block_index].pendingApply) { - assert(cacheBlocks[block_index].busyMask == 0); - for (int index = 0; index < numElementsPerLine; index++) { - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - if (do_push) { - int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (needsPush[bit_index_base + index] == 0) { - needsPush[bit_index_base + index] = 1; - _workCount++; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } - } - } - } - stats.bitvectorLength.sample(needsPush.count()); - - assert(cacheBlocks[block_index].needsWB); - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - - assert(MSHR.size() <= numMSHREntries); - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflicts.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - 
[this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - stats.numInvalidApplies++; - } - - applyQueue.pop_front(); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - - if (done()) { - owner->recvDoneSignal(); - } -} - -void -CoalesceEngine::processNextMemoryEvent() -{ - if (memPort.blocked()) { - stats.numMemoryBlocks++; - nextMemoryEvent.sleep(); - return; - } - - DPRINTF(CoalesceEngine, "%s: Processing another " - "memory function.\n", __func__); - std::function next_memory_function; - int next_memory_function_input; - Tick next_memory_function_tick; - std::tie( - next_memory_function, - next_memory_function_input, - next_memory_function_tick) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input, next_memory_function_tick); - memoryFunctionQueue.pop_front(); - stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) - * 1e9 / getClockFrequency()); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); - - assert(!nextMemoryEvent.pending()); - assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { - schedule(nextMemoryEvent, nextCycle()); - } -} - -void -CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) -{ - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", - __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - // A cache block should not be touched while it's waiting for data. - // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - - if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { - return; - } - - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - bool need_send_pkt = true; - for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) - { - PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { - wb_pkt->writeDataToBlock( - (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - cacheBlocks[block_index].needsWB = true; - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == cacheBlocks[block_index].addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - 
responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } - postPushWBQueue.erase(wb); - need_send_pkt = false; - } - } - - if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { - need_send_pkt = false; - } - - if (need_send_pkt) { - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != - pendingVertexPullReads.end()) { - stats.numDoubleMemReads++; - } - } -} - -void -CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) -{ - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", - __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(cacheBlocks[block_index].pendingWB); - - // Why would we write it back if it does not have a conflict. 
- assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - - PacketPtr pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to " - "Addr: %lu, size = %d.\n", __func__, - pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - // onTheFlyReqs++; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].pendingWB = false; - - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " - "write back has been scheduled for it. 
Ignoring " - "the current write back scheduled at tick %lu for " - "the right function scheduled later.\n", - __func__, block_index, schedule_tick); - stats.numInvalidWriteBacks++; - } -} - -void -CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) -{ - PacketPtr wb_pkt; - Tick pkt_tick; - std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); - if (schedule_tick == pkt_tick) { - memPort.sendPacket(wb_pkt); - postPushWBQueue.pop_front(); - } -} - -std::tuple -CoalesceEngine::getOptimalPullAddr() -{ - int visited_bits = 0; - int num_intial_active_bits = activeBits.size(); - while (visited_bits < num_intial_active_bits) { - int index = activeBits.front(); - int base_index = roundDown(index, numElementsPerLine); - int index_offset = index - base_index; - assert(needsPush[index] == 1); - assert(index_offset < numElementsPerLine); - - Addr addr = getBlockAddrFromBitIndex(base_index); - int block_index = getBlockIndex(addr); - if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) - { - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::PENDING_READ, addr, index_offset); - } else { - // Only if it is in cache and it is in idle state. 
- if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_CACHE, block_index, index_offset); - // Otherwise if it is in memory - } else if ((cacheBlocks[block_index].addr != addr)) { - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_MEMORY, addr, index_offset); - } - } - activeBits.pop_front(); - activeBits.push_back(index); - visited_bits++; - } - - return std::make_tuple(WorkLocation::GARBAGE, 0, 0); -} - -void -CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) -{ - WorkLocation bit_status; - Addr location; - int offset; - - std::tie(bit_status, location, offset) = getOptimalPullAddr(); - - if (bit_status != WorkLocation::GARBAGE) { - if (bit_status == WorkLocation::PENDING_READ) { - // renaming the outputs to thier local names. - Addr addr = location; - int index_offset = offset; - - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } - if (bit_status == WorkLocation::IN_CACHE) { - // renaming the outputs to their local names. 
- int block_index = (int) location; - int wl_offset = offset; - - Addr addr = cacheBlocks[block_index].addr; - Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); - int slice_base_index = getBitIndexBase(addr); - - needsPush[slice_base_index + wl_offset] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb; - std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].needsWB |= do_wb; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - cacheBlocks[block_index].items[wl_offset].edgeIndex, - cacheBlocks[block_index].items[wl_offset].degree); - } else { - DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - numPullsReceived--; - } - if (bit_status == WorkLocation::IN_MEMORY) { - if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - maxPotentialPostPushWB++; - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } - } - } - - stats.bitvectorSearchStatus[bit_status]++; - - if (numPullsReceived > 0) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " - "0 to memoryFunctionQueue.\n", __func__); - } -} - -void -CoalesceEngine::recvMemRetry() -{ - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - - if (!nextMemoryEvent.pending()) { - 
DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - return; - } - assert(!nextMemoryEvent.scheduled()); - nextMemoryEvent.wake(); - schedule(nextMemoryEvent, nextCycle()); -} - -void -CoalesceEngine::recvVertexPull() -{ - bool should_schedule = (numPullsReceived == 0); - numPullsReceived++; - - stats.verticesPulled++; - stats.lastVertexPullTime = curTick() - stats.lastResetTick; - if (should_schedule) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } -} - -CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) - : statistics::Group(&_coalesce), - coalesce(_coalesce), - lastResetTick(0), - ADD_STAT(numVertexReads, statistics::units::Count::get(), - "Number of memory vertecies read from cache."), - ADD_STAT(numVertexWrites, statistics::units::Count::get(), - "Number of memory vertecies written to cache."), - ADD_STAT(readHits, statistics::units::Count::get(), - "Number of cache hits."), - ADD_STAT(readMisses, statistics::units::Count::get(), - "Number of cache misses."), - ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), - "Number of cache hit under misses."), - ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), - "Number of cache rejections caused by entry shortage."), - ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), - "Number of cache rejections caused by target shortage."), - ADD_STAT(responsePortShortage, statistics::units::Count::get(), - "Number of times a response has been " - "delayed because of port shortage. "), - ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), - "Number of times memory bandwidth was not available."), - ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), - "Number of times a memory block has been read twice. 
" - "Once for push and once to populate the cache."), - ADD_STAT(verticesPulled, statistics::units::Count::get(), - "Number of times a pull request has been sent by PushEngine."), - ADD_STAT(verticesPushed, statistics::units::Count::get(), - "Number of times a vertex has been pushed to the PushEngine"), - ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), - "Time of the last pull request. (Relative to reset_stats)"), - ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), - "Time of the last vertex push. (Relative to reset_stats)"), - ADD_STAT(numInvalidApplies, statistics::units::Count::get(), - "Number of times a line has become busy" - " while waiting to be applied."), - ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), - "Number of times a scheduled memory function has been invalid."), - ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), - "Distribution for the location of vertex searches."), - ADD_STAT(hitRate, statistics::units::Ratio::get(), - "Hit rate in the cache."), - ADD_STAT(vertexPullBW, statistics::units::Rate::get(), - "Rate at which pull requests arrive."), - ADD_STAT(vertexPushBW, statistics::units::Rate::get(), - "Rate at which vertices are pushed."), - ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries."), - ADD_STAT(bitvectorLength, statistics::units::Count::get(), - "Histogram of the length of the bitvector."), - ADD_STAT(responseQueueLatency, statistics::units::Second::get(), - "Histogram of the response latency to WLEngine. 
(ns)"), - ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), - "Histogram of the latency of processing a memory function.") -{ -} - -void -CoalesceEngine::CoalesceStats::regStats() -{ - using namespace statistics; - - bitvectorSearchStatus.init(NUM_STATUS); - bitvectorSearchStatus.subname(0, "PENDING_READ"); - bitvectorSearchStatus.subname(1, "IN_CACHE"); - bitvectorSearchStatus.subname(2, "IN_MEMORY"); - bitvectorSearchStatus.subname(3, "GARBAGE"); - - hitRate = (readHits + readHitUnderMisses) / - (readHits + readHitUnderMisses + readMisses); - - vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; - - vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - - mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); - bitvectorLength.init(64); - responseQueueLatency.init(64); - memoryFunctionLatency.init(64); -} - -void -CoalesceEngine::CoalesceStats::resetStats() -{ - statistics::Group::resetStats(); - - lastResetTick = curTick(); -} - -} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine_bak.hh b/src/accl/graph/sega/coalesce_engine_bak.hh deleted file mode 100644 index 0787a334c1..0000000000 --- a/src/accl/graph/sega/coalesce_engine_bak.hh +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ - -#include - -#include "accl/graph/base/data_structs.hh" -#include "accl/graph/base/graph_workload.hh" -#include "accl/graph/sega/base_memory_engine.hh" -#include "base/cprintf.hh" -#include "base/statistics.hh" -#include "params/CoalesceEngine.hh" - - - -namespace gem5 -{ - -enum WorkLocation -{ - PENDING_READ, - IN_CACHE, - IN_MEMORY, - GARBAGE, - NUM_STATUS -}; - -class MPU; - -class CoalesceEngine : public BaseMemoryEngine -{ - private: - struct Block - { - WorkListItem* items; - Addr addr; - uint64_t busyMask; - bool valid; - bool needsApply; - bool needsWB; - bool pendingData; - bool pendingApply; - bool pendingWB; - Tick lastChangedTick; - // TODO: This might be useful in the future - // Tick lastWLWriteTick; - Block() {} - Block(int num_elements): - addr(-1), - busyMask(0), - valid(false), - needsApply(false), - needsWB(false), - pendingData(false), - pendingApply(false), - pendingWB(false), - lastChangedTick(0), - { - items = new WorkListItem [num_elements]; - } - - std::string to_string() { - return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", - addr, busyMask, valid ? "true" : "false", - needsApply ? "true" : "false", needsWB ? "true" : "false", - pendingData ? "true" : "false", pendingApply ? "true" : "false", - pendingWB ? 
"true" : "false", lastChangedTick); - } - }; - - struct SenderState : public Packet::SenderState - { - bool isRetry; - SenderState(bool is_retry): isRetry(is_retry) {} - }; - MPU* owner; - GraphWorkload* graphWorkload; - - int numLines; - int numElementsPerLine; - Block* cacheBlocks; - - int onTheFlyReqs; - int numMSHREntries; - int numTgtsPerMSHR; - std::unordered_map> MSHR; - int maxRespPerCycle; - std::deque> responseQueue; - - int _workCount; - int numPullsReceived; - UniqueFIFO applyQueue; - std::bitset needsPush; - std::deque activeBits; - int postPushWBQueueSize; - std::deque> postPushWBQueue; - - int getBlockIndex(Addr addr); - int getBitIndexBase(Addr addr); - Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); - - int maxPotentialPostPushWB; - // A map from addr to sendMask. sendMask determines which bytes to - // send for push when getting the read response from memory. - std::unordered_map pendingVertexPullReads; - - MemoryEvent nextMemoryEvent; - void processNextMemoryEvent(); - void processNextRead(int block_index, Tick schedule_tick); - void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextVertexPull(int ignore, Tick schedule_tick); - void processNextPostPushWB(int ignore, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; - - EventFunctionWrapper nextResponseEvent; - void processNextResponseEvent(); - - EventFunctionWrapper nextPreWBApplyEvent; - void processNextPreWBApplyEvent(); - - struct CoalesceStats : public statistics::Group - { - CoalesceStats(CoalesceEngine &coalesce); - - virtual void regStats() override; - - virtual void resetStats() override; - - CoalesceEngine &coalesce; - - Tick lastResetTick; - - statistics::Scalar numVertexReads; - statistics::Scalar numVertexWrites; - statistics::Scalar readHits; - statistics::Scalar readMisses; - statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; - 
statistics::Scalar responsePortShortage; - statistics::Scalar numMemoryBlocks; - statistics::Scalar numDoubleMemReads; - statistics::Scalar verticesPulled; - statistics::Scalar verticesPushed; - statistics::Scalar lastVertexPullTime; - statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidApplies; - statistics::Scalar numInvalidWriteBacks; - - statistics::Vector bitvectorSearchStatus; - - statistics::Formula hitRate; - statistics::Formula vertexPullBW; - statistics::Formula vertexPushBW; - - statistics::Histogram mshrEntryLength; - statistics::Histogram bitvectorLength; - statistics::Histogram responseQueueLatency; - statistics::Histogram memoryFunctionLatency; - }; - - CoalesceStats stats; - - protected: - virtual void recvMemRetry() override; - virtual bool handleMemResp(PacketPtr pkt) override; - - public: - PARAMS(CoalesceEngine); - CoalesceEngine(const Params ¶ms); - void registerMPU(MPU* mpu); - - void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); - - bool recvWLRead(Addr addr); - void recvWLWrite(Addr addr, WorkListItem wl); - - int workCount() { return _workCount; } - void recvVertexPull(); - - bool done(); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc new file mode 100644 index 0000000000..8c9d223178 --- /dev/null +++ b/src/accl/graph/sega/enums.cc @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB", + "LOCKED_FOR_APPLY" +}; + + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..e7a8f84452 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + LOCKED_FOR_APPLY, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index b30060238d..f661bd68a6 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -87,12 +87,6 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, pushEngine->recvVertexPush(addr, delta, edge_index, degree); } -void -MPU::recvPrevPullCorrection() -{ - pushEngine->recvPrevPullCorrection(); -} - void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8f3b29f603..ad18a0d5a5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" #include "accl/graph/sega/push_engine.hh" #include 
"accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" @@ -64,10 +65,12 @@ class MPU : public SimObject AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); - bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); void recvWorkload(GraphWorkload* Workload); @@ -77,7 +80,6 @@ class MPU : public SimObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); - void recvPrevPullCorrection(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 07f37a28dc..a17991e335 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -155,13 +155,13 @@ void PushEngine::start() { assert(!_running); - assert(!nextVertexPullEvent.scheduled()); + // assert(!nextVertexPullEvent.scheduled()); _running = true; stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. 
assert(workLeft()); - if (vertexSpace()) { + if (vertexSpace() && !nextVertexPullEvent.scheduled()) { schedule(nextVertexPullEvent, nextCycle()); } } @@ -169,17 +169,16 @@ PushEngine::start() void PushEngine::processNextVertexPullEvent() { - // TODO: change edgePointerQueueSize - numPendingPulls++; - owner->recvVertexPull(); - - if (!workLeft()) { + if (workLeft()) { + numPendingPulls++; + owner->recvVertexPull(); + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + } else { _running = false; lastIdleEntranceTick = curTick(); - } - - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { - schedule(nextVertexPullEvent, nextCycle()); + DPRINTF(PushEngine, "%s: In idle state now.\n", __func__); } } @@ -197,9 +196,9 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, sizeof(Edge), peerMemoryAtomSize); edgePointerQueue.emplace_back(info_gen, curTick()); - numPendingPulls--; - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } @@ -209,16 +208,6 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, } } -void -PushEngine::recvPrevPullCorrection() -{ - assert(numPendingPulls > 0); - numPendingPulls--; - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { - schedule(nextVertexPullEvent, nextCycle()); - } -} - void PushEngine::processNextMemoryReadEvent() { @@ -255,7 +244,7 @@ PushEngine::processNextMemoryReadEvent() } } - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2e1de25390..08cceb14f0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -32,6 +32,7 @@ #include 
"accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -199,7 +200,6 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); - void recvPrevPullCorrection(); void recvReqRetry(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a698f2cc0a..2b305e1557 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -134,7 +134,7 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::checkRetryReq() { - for (int i = 0; i < inPorts.size(); ++i) { + for (int i = 0; i < inPorts.size(); i++) { inPorts[i].checkRetryReq(); } } @@ -191,12 +191,8 @@ WLEngine::processNextReadEvent() if (registerFile.size() < registerFileSize) { DPRINTF(WLEngine, "%s: There are free registers available in the " "registerFile.\n", __func__); - // TODO: It might be a good idea for WLEngine to act differently - // on cache rejects. As a first step the cache should not just - // return a boolean value. It should return an integer/enum - // to tell WLEngine why it rejected the read request. Their might - // be things that WLEngine can do to fix head of the line blocking. 
- if (owner->recvWLRead(update_addr)) { + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " "request to addr: %lu.\n", __func__, update_addr); registerFile[update_addr] = update_value; @@ -209,7 +205,8 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); - stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -220,6 +217,17 @@ WLEngine::processNextReadEvent() update_value, updateQueue.size(), updateQueueSize); checkRetryReq(); vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + updateQueue.pop_front(); + updateQueue.emplace_back( + update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + } else { + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Not rolling the update.\n", __func__); + } } } else { DPRINTF(WLEngine, "%s: There are no free registers " @@ -227,7 +235,6 @@ WLEngine::processNextReadEvent() stats.registerShortage++; } } else { - // TODO: Generalize this to reduce function rather than just min DPRINTF(WLEngine, "%s: A register has already been allocated for " "addr: %lu in registerFile. 
registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); @@ -238,7 +245,8 @@ WLEngine::processNextReadEvent() update_value, update_addr, registerFile[update_addr]); stats.registerFileCoalesce++; updateQueue.pop_front(); - stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f442d6060e..b5ad3d9040 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -35,6 +35,7 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/enums.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh new file mode 100644 index 0000000000..4102e29cd3 --- /dev/null +++ b/src/accl/graph/sega/work_directory.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ + +#include "base/addr_range.hh" +#include "base/types.hh" + +namespace gem5 +{ + +class WorkDirectory +{ + public: + virtual void activate(Addr atom_addr) = 0; + virtual void deactivate(Addr atom_addr) = 0; + virtual Addr getNextWork() = 0; + + virtual int workCount() = 0; + bool empty() { return workCount() == 0; } + + virtual void setLastAtomAddr(Addr atom_addr) = 0; +}; + +class PopCountDirectory: public WorkDirectory +{ + private: + AddrRange memoryRange; + + int numAtomsPerBlock; + int memoryAtomSize; + int blockSize; + + uint32_t _workCount; + + int numCounters; + int lastCounterIndex; + uint32_t* popCount; + + uint32_t currentIndex; + uint32_t currentCounter; + + int getIndexFromAtomAddr(Addr atom_addr) + { + assert((atom_addr % memoryAtomSize) == 0); + Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr); + int index = (int) (trimmed_addr / blockSize); + return index; + } + + Addr getAtomAddrFromIndex(int block_index, int atom_index) + { + Addr block_addr = block_index * blockSize; + Addr trimmed_addr = block_addr + atom_index * memoryAtomSize; + return memoryRange.addIntlvBits(trimmed_addr); + } + + public: + PopCountDirectory(AddrRange mem_range, int atoms_per_block, int atom_size): + WorkDirectory(), + memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), + memoryAtomSize(atom_size), _workCount(0), + currentIndex(0), currentCounter(0) + { + blockSize = numAtomsPerBlock * memoryAtomSize; + int numCounters = (int) (memoryRange.size() / blockSize); + lastCounterIndex = numCounters - 1; + popCount = new uint32_t [numCounters]; + for (int index = 0; index < numCounters; index++) { + popCount[index] = 0; + } + } + + // CAUTION: This should only be called when the work + // directory **is not** tracking the the atom with atom_addr + virtual void activate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = 
popCount[index]; + popCount[index]++; + _workCount++; + assert(popCount[index] > prev_count); + assert(popCount[index] <= numAtomsPerBlock); + } + + // CAUTION: This should only be called when the work + // directory **is** tracking the the atom with atom_addr + virtual void deactivate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = popCount[index]; + popCount[index]--; + _workCount--; + assert(popCount[index] < prev_count); + assert(popCount[index] <= numAtomsPerBlock); + } + + virtual int workCount() { return _workCount; } + + void setLastAtomAddr(Addr atom_addr) + { + lastCounterIndex = getIndexFromAtomAddr(atom_addr); + } + + // CAUTION: If this function returns an addr that + // is in the cache, that addr should be ignored. + // CAUTION: The receiver should track the last n + // addresses that this WorkDirectory has generated. + // where n is equal to the size of the entry holding + // reads generated by this WorkDirectory. In case + // the WorkDirectory generates a repeated address + // it should be ignored. + // FIXME: This should return garbage if it can't find anything. + // virtual Addr getNextWork() + // { + // if ((currentCounter == numAtomsPerBlock) || + // (popCount[currentIndex] == 0)) { + // int prev_index = currentIndex; + // while (true) { + // currentIndex++; + // // NOTE: this is an optimization. + // // lastCounterIndex tracks the last blockOfAtom that + // // has vertices. By default it is set to numCounters - 1. + // // However, it might not be necessary to track all the + // // numCounters counters. e.g. If this WorkDirectory is tracking + // // a 512 MiB memory with atom size of 32 B and 256 atoms + // // per block. Then it needs 64 Ki counters of 8 bit wide. + // // However, if we need 8 Mi atoms to store all our vertices, + // // the second half of the counters would not be used at all + // // (512 MiB hold 16 Mi atoms and we're only using half). 
+ // if (currentIndex > lastCounterIndex) { + // currentIndex = 0; + // } + // if (prev_index == currentIndex) { + // // NOTE: If we have reached the same index as before, + // // we need to decrement the currentCounter to generate + // // a repeatative address. This way the receiver can detect + // // the uselessness of the generated address and ignore it + // currentCounter--; + // break; + // } + // if (popCount[currentIndex] > 0) { + // currentCounter = 0; + // break; + // } + // } + // } + // Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + // currentCounter++; + + // return ret_addr; + // } + + virtual Addr getNextWork() + { + if ((currentCounter == numAtomsPerBlock) || + (popCount[currentIndex] == 0)) { + int other_count = _workCount - popCount[currentIndex]; + if (other_count == 0) { + currentCounter = 0; + } else { + int prev_index = currentIndex; + while (true) { + currentIndex++; + if (currentIndex > lastCounterIndex) { + currentIndex = 0; + } + if (currentIndex == prev_index) { + break; + } + if (popCount[currentIndex] > 0) { + break; + } + } + currentCounter = 0; + } + } + Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + currentCounter++; + return ret_addr; + } +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index c65d68a5a7..3cbacef800 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -212,7 +212,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; From c4fc96e2146aeec5e7a978c11dfd4e5b36a7a67b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 Nov 2022 19:53:35 -0800 Subject: [PATCH 212/247] Adding new stats. 
--- configs/accl/sega.py | 12 ++++-- src/accl/graph/sega/CoalesceEngine.py | 2 - src/accl/graph/sega/coalesce_engine.cc | 51 ++++++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 16 ++++++-- src/accl/graph/sega/push_engine.hh | 5 ++- 6 files changed, 51 insertions(+), 39 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 54f22b1377..7baa27fd5e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -53,7 +53,6 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=64, max_resp_per_cycle=8, active_buffer_size = 64, post_push_wb_queue_size=64, @@ -61,7 +60,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=512, + resp_queue_size=4096, update_queue_size=32, ) @@ -74,7 +73,11 @@ def __init__(self, edge_memory_size: str, cache_size: str): range=AddrRange(edge_memory_size), in_addr_map=False ) ) - + # self.edge_mem_ctrl = SimpleMemory(latency="90ns", + # latency_var="0ns", + # bandwidth="18GiB/s", + # range=AddrRange(edge_memory_size), + # in_addr_map=False) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -105,6 +108,9 @@ def set_vertex_pch_bit(self, pch_bit): def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image + # def set_edge_image(self, edge_image): + # self.edge_mem_ctrl.image_file = edge_image + class SEGA(System): diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index a447dedc3d..76e7d262e8 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,8 +37,6 @@ class CoalesceEngine(BaseMemoryEngine): cache_size = Param.MemorySize("Size of the internal SRAM array.") - 
num_mshr_entry = Param.Int("Number of MSHR entries.") - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") active_buffer_size = Param.Int("Maximum number of memory active memory " diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0aa61345f7..d7cf173097 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,7 +46,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), + onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), pullsReceived(0), pullsScheduled(0), pendingPullReads(0), activeBufferSize(params.active_buffer_size), @@ -227,7 +227,6 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].dirty); - assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " @@ -239,7 +238,6 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // miss assert(cacheBlocks[block_index].addr != aligned_addr); - assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (cacheBlocks[block_index].state != CacheState::INVALID) { @@ -284,29 +282,26 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // cold miss assert(MSHR.find(block_index) == MSHR.end()); - if (MSHR.size() < numMSHREntries) { - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].dirty = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].state = 
CacheState::PENDING_DATA; - cacheBlocks[block_index].lastChangedTick = curTick(); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); - MSHR[block_index].push_back(addr); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - return ReadReturnStatus::ACCEPT; - } else { - return ReadReturnStatus::REJECT_ROLL; + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } + return ReadReturnStatus::ACCEPT; } + stats.readMisses++; } } @@ -939,6 +934,8 @@ CoalesceEngine::processNextApplyEvent() owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } pkt->deleteData(); @@ -986,6 +983,8 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].items[index].edgeIndex, cacheBlocks[block_index].items[index].degree); pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } @@ -1057,8 +1056,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), - ADD_STAT(mshrEntryShortage, 
statistics::units::Count::get(), - "Number of cache rejections caused by entry shortage."), ADD_STAT(responsePortShortage, statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. "), @@ -1082,7 +1079,7 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(vertexPushBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(bitvectorLength, statistics::units::Count::get(), + ADD_STAT(frontierSize, statistics::units::Count::get(), "Histogram of the length of the bitvector."), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. (ns)"), @@ -1103,7 +1100,7 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - bitvectorLength.init(64); + frontierSize.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c457b214f9..f87e0027a2 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,7 +106,6 @@ class CoalesceEngine : public BaseMemoryEngine Block* cacheBlocks; int onTheFlyReqs; - int numMSHREntries; std::unordered_map> MSHR; // Response route to WLEngine @@ -167,7 +166,6 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar verticesPulled; @@ -180,7 +178,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram bitvectorLength; + statistics::Histogram frontierSize; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; diff 
--git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a17991e335..09f29a43e4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -158,7 +158,7 @@ PushEngine::start() // assert(!nextVertexPullEvent.scheduled()); _running = true; - stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); + // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. assert(workLeft()); if (vertexSpace() && !nextVertexPullEvent.scheduled()) { @@ -196,6 +196,7 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, sizeof(Edge), peerMemoryAtomSize); edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); numPendingPulls--; if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { @@ -239,6 +240,7 @@ PushEngine::processNextMemoryReadEvent() stats.edgePointerQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); } @@ -282,6 +284,7 @@ PushEngine::handleMemResp(PacketPtr pkt) MetaEdge meta_edge( push_info.src, edge_dst, edge_weight, push_info.value); metaEdgeQueue.emplace_back(meta_edge, curTick()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); } stats.numWastefulEdgesRead += (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; @@ -320,6 +323,7 @@ PushEngine::processNextPropagateEvent() stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); } else { metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } @@ -466,8 +470,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of propagate operations done."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), - ADD_STAT(numIdleCycles, statistics::units::Count::get(), - "Number of cycles PushEngine has been idle."), + // ADD_STAT(numIdleCycles, statistics::units::Count::get(), + // "Number of cycles PushEngine has been idle."), ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), "Number of coalescions in the update queues."), ADD_STAT(numUpdates, statistics::units::Count::get(), @@ -479,8 +483,12 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Traversed Edges Per Second."), ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(), + "Histogram of the size of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the metaEdgeQueue."), + ADD_STAT(edgeQueueLength, statistics::units::Count::get(), + "Histogram of the size of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), 
ADD_STAT(numPropagatesHist, statistics::units::Count::get(), @@ -496,7 +504,9 @@ PushEngine::PushStats::regStats() TEPS = numPropagates / simSeconds; edgePointerQueueLatency.init(64); + edgePointerQueueLength.init(64); edgeQueueLatency.init(64); + edgeQueueLength.init(64); updateQueueLength.init(64); numPropagatesHist.init(push.params().max_propagates_per_cycle); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 08cceb14f0..f51865acb3 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -164,9 +164,10 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; + statistics::Scalar numMemoryBlocks; statistics::Scalar numPropagates; statistics::Scalar numNetBlocks; - statistics::Scalar numIdleCycles; + // statistics::Scalar numIdleCycles; statistics::Scalar updateQueueCoalescions; statistics::Scalar numUpdates; statistics::Scalar numWastefulEdgesRead; @@ -174,7 +175,9 @@ class PushEngine : public BaseMemoryEngine statistics::Formula TEPS; statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; statistics::Histogram updateQueueLength; statistics::Histogram numPropagatesHist; }; From b68602b864a995b5d5a248fb5364f973fc2ace3b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 07:36:05 -0800 Subject: [PATCH 213/247] Adding state. 
--- configs/accl/bfs.py | 35 ++++++++++++++++++++------ configs/accl/sega.py | 6 +---- src/accl/graph/sega/PushEngine.py | 4 +-- src/accl/graph/sega/coalesce_engine.cc | 26 +++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/work_directory.hh | 10 +++++--- 6 files changed, 57 insertions(+), 25 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index a201acd4d1..80331e3aad 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -47,6 +47,14 @@ def get_inputs(): default=False, help="Print final answer", ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample statistics", + ) args = argparser.parse_args() @@ -56,24 +64,37 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.sample, args.verify, ) if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() + num_gpts, cache_size, graph, init_addr, init_value, sample, verify = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() - system.create_pop_count_directory(256) + system.create_pop_count_directory(64) system.create_bfs_workload(init_addr, init_value) - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + if sample: + while True: + exit_event = m5.simulate(10000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) if verify: system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7baa27fd5e..29a017ba65 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,6 
+61,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=4096, + max_propagates_per_cycle=8, update_queue_size=32, ) @@ -73,11 +74,6 @@ def __init__(self, edge_memory_size: str, cache_size: str): range=AddrRange(edge_memory_size), in_addr_map=False ) ) - # self.edge_mem_ctrl = SimpleMemory(latency="90ns", - # latency_var="0ns", - # bandwidth="18GiB/s", - # range=AddrRange(edge_memory_size), - # in_addr_map=False) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 20c5452d43..63fa1eae62 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -42,8 +42,8 @@ class PushEngine(BaseMemoryEngine): "push engine where it stores the " "edges read from memory.") - max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " - "done per cycle.") + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") update_queue_size = Param.Int("Maximum number of entries " "for each update queue.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d7cf173097..adb33064f7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -239,7 +239,7 @@ CoalesceEngine::recvWLRead(Addr addr) // miss assert(cacheBlocks[block_index].addr != aligned_addr); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - + stats.readMisses++; if (cacheBlocks[block_index].state != CacheState::INVALID) { // conflict miss DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " @@ -268,7 +268,9 @@ CoalesceEngine::recvWLRead(Addr addr) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = 
directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } // NOTE: Bring the cache line to invalid state. // NOTE: Above line where we set hasConflict to true @@ -301,7 +303,6 @@ CoalesceEngine::recvWLRead(Addr addr) } return ReadReturnStatus::ACCEPT; } - stats.readMisses++; } } @@ -376,8 +377,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[index]); } if (atom_active) { - directory->deactivate(addr); + int count = directory->deactivate(addr); activeCacheBlocks.push_back(block_index); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } assert(MSHR.find(block_index) != MSHR.end()); @@ -433,8 +436,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active |= graphWorkload->activeCondition(items[index]); } if (atom_active) { - directory->deactivate(addr); + int count = directory->deactivate(addr); activeBuffer.emplace_back(pkt, curTick()); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. 
" "activeBuffer.size: %d.\n", __func__, pkt->print(), activeBuffer.size()); @@ -591,7 +596,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } cacheBlocks[block_index].reset(); } @@ -804,7 +811,9 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } PacketPtr pkt = createWritePacket( @@ -1081,6 +1090,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Rate at which vertices are pushed."), ADD_STAT(frontierSize, statistics::units::Count::get(), "Histogram of the length of the bitvector."), + ADD_STAT(blockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), @@ -1101,6 +1112,7 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; frontierSize.init(64); + blockActiveCount.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f87e0027a2..b855fda38b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -179,6 +179,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPushBW; statistics::Histogram frontierSize; + statistics::Histogram blockActiveCount; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 4102e29cd3..35778686c8 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -38,8 +38,8 @@ namespace gem5 class WorkDirectory { public: - virtual void activate(Addr atom_addr) = 0; - virtual void deactivate(Addr atom_addr) = 0; + virtual int activate(Addr atom_addr) = 0; + virtual int deactivate(Addr atom_addr) = 0; virtual Addr getNextWork() = 0; virtual int workCount() = 0; @@ -99,7 +99,7 @@ class PopCountDirectory: public WorkDirectory // CAUTION: This should only be called when the work // directory **is not** tracking the the atom with atom_addr - virtual void activate(Addr atom_addr) + virtual int activate(Addr atom_addr) { int index = getIndexFromAtomAddr(atom_addr); uint32_t prev_count = popCount[index]; @@ -107,11 +107,12 @@ class PopCountDirectory: public WorkDirectory _workCount++; assert(popCount[index] > prev_count); assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; } // CAUTION: This should only be called when the work // directory **is** tracking the the atom with atom_addr - virtual 
void deactivate(Addr atom_addr) + virtual int deactivate(Addr atom_addr) { int index = getIndexFromAtomAddr(atom_addr); uint32_t prev_count = popCount[index]; @@ -119,6 +120,7 @@ class PopCountDirectory: public WorkDirectory _workCount--; assert(popCount[index] < prev_count); assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; } virtual int workCount() { return _workCount; } From ec5025f2b3b1143ed9c1663e47464d937705ded3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 15:00:00 -0800 Subject: [PATCH 214/247] Adding stat to count number of conflict misses. --- src/accl/graph/sega/coalesce_engine.cc | 3 +++ src/accl/graph/sega/coalesce_engine.hh | 1 + 2 files changed, 4 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index adb33064f7..8c636615cd 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -279,6 +279,7 @@ CoalesceEngine::recvWLRead(Addr addr) } return ReadReturnStatus::REJECT_NO_ROLL; } else { + stats.numConflicts++; return ReadReturnStatus::REJECT_ROLL; } } else { @@ -1065,6 +1066,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), ADD_STAT(responsePortShortage, statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. 
"), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b855fda38b..c2da6a90cd 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -166,6 +166,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; + statistics::Scalar numConflicts; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar verticesPulled; From ca971137593af82054c428ea6d8bca8e949463d0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 15:17:20 -0800 Subject: [PATCH 215/247] Adding stat to count the number of update rolls. --- src/accl/graph/sega/coalesce_engine.cc | 3 --- src/accl/graph/sega/enums.cc | 3 +-- src/accl/graph/sega/enums.hh | 1 - src/accl/graph/sega/wl_engine.cc | 4 ++++ src/accl/graph/sega/wl_engine.hh | 1 + 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8c636615cd..b9ac25c502 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -179,9 +179,6 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit - if (cacheBlocks[block_index].state == CacheState::LOCKED_FOR_APPLY) { - return ReadReturnStatus::REJECT_NO_ROLL; - } DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; assert(cacheBlocks[block_index].state != CacheState::INVALID); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 8c9d223178..de5d569c18 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -36,8 +36,7 @@ const char* cacheStateStrings[NUM_CACHE_STATE] = { "PENDING_DATA", "BUSY", "IDLE", - "PENDING_WB", - "LOCKED_FOR_APPLY" + "PENDING_WB" }; diff --git 
a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index e7a8f84452..6153386b71 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -39,7 +39,6 @@ enum CacheState BUSY, IDLE, PENDING_WB, - LOCKED_FOR_APPLY, NUM_CACHE_STATE }; extern const char* cacheStateStrings[NUM_CACHE_STATE]; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 2b305e1557..ed91622b43 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -224,6 +224,7 @@ WLEngine::processNextReadEvent() update_addr, update_value, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); + stats.numUpdateRolls++; } else { DPRINTF(WLEngine, "%s: Received a reject from cache. " "Not rolling the update.\n", __func__); @@ -330,6 +331,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " "stalled because of register shortage"), + ADD_STAT(numUpdateRolls, statistics::units::Count::get(), + "Number of times an update has been rolled back " + "to the back of the update queue due to cache reject."), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), "Histogram of the latency of reading a vertex (ns)."), ADD_STAT(updateQueueLatency, statistics::units::Second::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index b5ad3d9040..45baaa1e79 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -101,6 +101,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + statistics::Scalar numUpdateRolls; statistics::Histogram vertexReadLatency; statistics::Histogram updateQueueLatency; From fd1561f7435537165a458e3aac7afded87904475 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 19:47:35 -0800 
Subject: [PATCH 216/247] Removing unnecessary comments. --- src/accl/graph/sega/coalesce_engine.cc | 52 +++----------------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b9ac25c502..98229dde24 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -86,6 +86,9 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) int block_index = getBlockIndex(addr); // FIXME: Check postPushWBQueue for hits + // Is it really the case though. I don't think at this time + // beacuse we check done after handleMemResp and make sure all + // the writes to memory are done before scheduling an exit event if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -438,23 +441,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) activeBuffer.emplace_back(pkt, curTick()); stats.blockActiveCount.sample(count); stats.frontierSize.sample(directory->workCount()); - DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. " - "activeBuffer.size: %d.\n", __func__, - pkt->print(), activeBuffer.size()); } else { delete pkt; } - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // pullsScheduled++; - // } + if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { @@ -685,9 +675,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) need_send_pkt = false; wb = postPushWBQueue.erase(wb); delete wb_pkt; - DPRINTF(MSDebug, "%s: Found addr: %lu in postPushWBQueue. 
" - "postPushWBQueue.size: %d.\n", __func__, - cacheBlocks[block_index].addr, postPushWBQueue.size()); } else { wb++; } @@ -707,16 +694,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) need_send_pkt = false; ab = activeBuffer.erase(ab); delete ab_pkt; - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // pullsScheduled++; - // } - DPRINTF(MSDebug, "%s: Found addr: %lu in activeBuffer. " - "activeBuffer.size: %d.\n", __func__, - cacheBlocks[block_index].addr, activeBuffer.size()); if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { @@ -841,6 +818,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) if (postPushWBQueue.empty()) { return; } + PacketPtr wb_pkt; Tick pkt_tick; std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); @@ -848,9 +826,6 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) memPort.sendPacket(wb_pkt); onTheFlyReqs++; postPushWBQueue.pop_front(); - DPRINTF(MSDebug, "%s: Popped pkt: %s from postPushWBQueue. " - "postPushWBQueue.size: %d.\n", __func__, - wb_pkt->print(), postPushWBQueue.size()); } } @@ -958,13 +933,7 @@ CoalesceEngine::processNextApplyEvent() PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); - DPRINTF(MSDebug, "%s: Empalced pkt: %s in postPushWBQueue. " - "postPushWBQueue.size: %d.\n", __func__, - wb_pkt->print(), postPushWBQueue.size()); activeBuffer.pop_front(); - DPRINTF(MSDebug, "%s: Popped pkt: %s from activeBuffer. 
" - "activeBuffer.size: %d.\n", __func__, - pkt->print(), activeBuffer.size()); memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextPostPushWB(ignore, schedule_tick); @@ -1020,17 +989,6 @@ CoalesceEngine::processNextApplyEvent() "work to apply.\n", __func__); } - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // pullsScheduled++; - // } if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { From 1124f5be5c9272df474387555d95f4e0603486c1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 22:17:20 -0800 Subject: [PATCH 217/247] Removing comments. --- src/accl/graph/sega/work_directory.hh | 103 ++++++++------------------ 1 file changed, 30 insertions(+), 73 deletions(-) diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 35778686c8..18430aee0d 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -29,6 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ #define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#include + +#include "accl/graph/base/data_structs.hh" #include "base/addr_range.hh" #include "base/types.hh" @@ -63,9 +66,11 @@ class PopCountDirectory: public WorkDirectory int lastCounterIndex; uint32_t* popCount; - uint32_t currentIndex; + uint32_t prevIndex; uint32_t currentCounter; + UniqueFIFO activeBlockIndices; + int getIndexFromAtomAddr(Addr atom_addr) { assert((atom_addr % memoryAtomSize) == 0); @@ -86,7 +91,7 @@ class PopCountDirectory: public WorkDirectory WorkDirectory(), memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), memoryAtomSize(atom_size), _workCount(0), - 
currentIndex(0), currentCounter(0) + prevIndex(-1), currentCounter(0) { blockSize = numAtomsPerBlock * memoryAtomSize; int numCounters = (int) (memoryRange.size() / blockSize); @@ -105,6 +110,7 @@ class PopCountDirectory: public WorkDirectory uint32_t prev_count = popCount[index]; popCount[index]++; _workCount++; + activeBlockIndices.push_back(index); assert(popCount[index] > prev_count); assert(popCount[index] <= numAtomsPerBlock); return popCount[index]; @@ -118,6 +124,9 @@ class PopCountDirectory: public WorkDirectory uint32_t prev_count = popCount[index]; popCount[index]--; _workCount--; + if (popCount[index] == 0) { + activeBlockIndices.erase(index); + } assert(popCount[index] < prev_count); assert(popCount[index] <= numAtomsPerBlock); return popCount[index]; @@ -130,80 +139,28 @@ class PopCountDirectory: public WorkDirectory lastCounterIndex = getIndexFromAtomAddr(atom_addr); } - // CAUTION: If this function returns an addr that - // is in the cache, that addr should be ignored. - // CAUTION: The receiver should track the last n - // addresses that this WorkDirectory has generated. - // where n is equal to the size of the entry holding - // reads generated by this WorkDirectory. In case - // the WorkDirectory generates a repeated address - // it should be ignored. - // FIXME: This should return garbage if it can't find anything. - // virtual Addr getNextWork() - // { - // if ((currentCounter == numAtomsPerBlock) || - // (popCount[currentIndex] == 0)) { - // int prev_index = currentIndex; - // while (true) { - // currentIndex++; - // // NOTE: this is an optimization. - // // lastCounterIndex tracks the last blockOfAtom that - // // has vertices. By default it is set to numCounters - 1. - // // However, it might not be necessary to track all the - // // numCounters counters. e.g. If this WorkDirectory is tracking - // // a 512 MiB memory with atom size of 32 B and 256 atoms - // // per block. Then it needs 64 Ki counters of 8 bit wide. 
- // // However, if we need 8 Mi atoms to store all our vertices, - // // the second half of the counters would not be used at all - // // (512 MiB hold 16 Mi atoms and we're only using half). - // if (currentIndex > lastCounterIndex) { - // currentIndex = 0; - // } - // if (prev_index == currentIndex) { - // // NOTE: If we have reached the same index as before, - // // we need to decrement the currentCounter to generate - // // a repeatative address. This way the receiver can detect - // // the uselessness of the generated address and ignore it - // currentCounter--; - // break; - // } - // if (popCount[currentIndex] > 0) { - // currentCounter = 0; - // break; - // } - // } - // } - // Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); - // currentCounter++; - - // return ret_addr; - // } - + // CAUTION: This directory only tracks active vertices in the memory + // and it does not have any information on the state of the cache and/or + // the active buffer or the write buffer. Therefore, it might generate a + // read request to an address that might be in any of those. In that case, + // the generated address should be ignored. virtual Addr getNextWork() { - if ((currentCounter == numAtomsPerBlock) || - (popCount[currentIndex] == 0)) { - int other_count = _workCount - popCount[currentIndex]; - if (other_count == 0) { - currentCounter = 0; - } else { - int prev_index = currentIndex; - while (true) { - currentIndex++; - if (currentIndex > lastCounterIndex) { - currentIndex = 0; - } - if (currentIndex == prev_index) { - break; - } - if (popCount[currentIndex] > 0) { - break; - } - } - currentCounter = 0; - } + // Why ask directory if it's empty? 
+ assert(!activeBlockIndices.empty()); + int front_index = activeBlockIndices.front(); + assert(popCount[front_index] > 0); + if ((prevIndex != -1) && (prevIndex != front_index)) { + currentCounter = 0; + } + if (currentCounter == numAtomsPerBlock) { + currentCounter = 0; + activeBlockIndices.pop_front(); + activeBlockIndices.push_back(front_index); } - Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + int current_index = activeBlockIndices.front(); + Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter); + prevIndex = current_index; currentCounter++; return ret_addr; } From c2b08a68d27767737a489c72e7fcf7d80be10bc2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 9 Nov 2022 09:05:29 -0800 Subject: [PATCH 218/247] Adding pr and updating config scripts. --- configs/accl/bfs.py | 24 ++-- configs/accl/pr-sample.py | 109 -------------- configs/accl/pr.py | 44 +++++- configs/accl/sega.py | 36 +++-- src/accl/graph/base/graph_workload.cc | 157 +++++++++------------ src/accl/graph/base/graph_workload.hh | 38 ++--- src/accl/graph/sega/CenteralController.py | 2 +- src/accl/graph/sega/CoalesceEngine.py | 1 + src/accl/graph/sega/centeral_controller.cc | 10 +- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 53 ++++--- src/accl/graph/sega/coalesce_engine.hh | 5 +- 12 files changed, 201 insertions(+), 280 deletions(-) delete mode 100644 configs/accl/pr-sample.py diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 80331e3aad..829449c599 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -40,20 +40,20 @@ def get_inputs(): argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) argparser.add_argument( - "--verify", - dest="verify", + "--sample", + dest="sample", action="store_const", const=True, default=False, - help="Print final answer", + help="Sample sim stats every 100us", ) argparser.add_argument( - "--sample", - dest="sample", + 
"--verify", + dest="verify", action="store_const", const=True, default=False, - help="Sample statistics", + help="Print final answer", ) args = argparser.parse_args() @@ -70,7 +70,15 @@ def get_inputs(): if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, sample, verify = get_inputs() + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + sample, + verify, + ) = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) @@ -81,7 +89,7 @@ def get_inputs(): system.create_bfs_workload(init_addr, init_value) if sample: while True: - exit_event = m5.simulate(10000000) + exit_event = m5.simulate(100000000) print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" diff --git a/configs/accl/pr-sample.py b/configs/accl/pr-sample.py deleted file mode 100644 index ac3616dc84..0000000000 --- a/configs/accl/pr-sample.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from sega import SEGA - -import m5 -import argparse - -from m5.objects import * - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("alpha", type=float) - argparser.add_argument("threshold", type=float) - argparser.add_argument( - "--verify", - dest="verify", - action="store_const", - const=True, - default=False, - help="Print final answer", - ) - argparser.add_argument( - "--sample", - dest="sample", - action="store_const", - const=True, - default=False, - help="Sample sim stats every 10us", - ) - - args = argparser.parse_args() - - return ( - args.num_gpts, - args.cache_size, - args.graph, - args.alpha, - args.threshold, - args.verify, - args.sample, - ) - - -if __name__ == "__m5_main__": - ( - num_gpts, - cache_size, - graph, - alpha, - threshold, - verify, - sample, - ) = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system=False, system=system) - - m5.instantiate() - - system.create_pr_workload(alpha, threshold) - - if sample: - while True: - exit_event = m5.simulate(10000000) 
- print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - m5.stats.dump() - m5.stats.reset() - print(exit_event.getCause()) - if exit_event.getCause() != "simulate() limit reached": - break - else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - if verify: - system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 59e8b924c6..e852e47561 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -39,6 +39,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) argparser.add_argument( "--verify", dest="verify", @@ -56,23 +64,45 @@ def get_inputs(): args.graph, args.alpha, args.threshold, + args.sample, args.verify, ) - if __name__ == "__m5_main__": - num_gpts, cache_size, graph, alpha, threshold, verify = get_inputs() + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + sample, + verify, + ) = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() + system.create_pop_count_directory(64) system.create_pr_workload(alpha, threshold) - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + print(exit_event.getCause()) + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) if verify: 
system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 29a017ba65..7831302228 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,14 +47,18 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): + def __init__( + self, edge_memory_size: str, cache_size: str, simple_mem: bool = False + ): super().__init__() + self._simple_mem = simple_mem self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - active_buffer_size = 64, + pending_pull_limit=32, + active_buffer_size=64, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -65,9 +69,15 @@ def __init__(self, edge_memory_size: str, cache_size: str): update_queue_size=32, ) - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(), dram_2=HBM_2000_4H_1x64() - ) + if self._simple_mem: + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + ) + else: + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), + dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) + ) self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( @@ -96,18 +106,20 @@ def setReqPort(self, port): self.push_engine.out_ports = port def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + if self._simple_mem: + self.vertex_mem_ctrl.range = vertex_ranges[0] + else: + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit + if self._simple_mem: + pass + else: + self.vertex_mem_ctrl.pch_bit = pch_bit def 
set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image - # def set_edge_image(self, edge_image): - # self.edge_mem_ctrl.image_file = edge_image - - class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 446509201f..0539296cce 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -113,92 +113,75 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } -// PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): -// GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) -// { -// numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -// } - -// void -// PRWorkload::init(PacketPtr pkt, int bit_index_base, -// std::bitset& needsPush, -// std::deque& activeBits, -// int& _workCount) -// { -// WorkListItem items[numElementsPerLine]; - -// pkt->writeDataToBlock((uint8_t*) items, atomSize); -// for (int i = 0; i < numElementsPerLine; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - alpha); -// if (items[i].degree > 0) { -// needsPush[bit_index_base + i] = 1; -// activeBits.push_back(bit_index_base + i); -// _workCount++; -// } -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, atomSize); -// } - -// uint32_t -// PRWorkload::reduce(uint32_t update, uint32_t value) -// { -// float update_float = writeToFloat(update); -// float value_float = writeToFloat(value); -// return readFromFloat(update_float + value_float); -// } - -// uint32_t -// PRWorkload::propagate(uint32_t value, uint32_t weight) -// { -// float value_float = writeToFloat(value); -// float weight_float = 1.0; - -// return readFromFloat(alpha * value_float * weight_float); -// } - -// bool -// PRWorkload::applyCondition(WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float 
prop_float = writeToFloat(wl.prop); -// float dist = std::abs(temp_float - prop_float); -// return dist >= threshold; -// } - -// bool -// PRWorkload::preWBApply(WorkListItem& wl) -// { -// if (applyCondition(wl) && (wl.degree > 0)) { -// return true; -// } -// return false; -// } - -// std::tuple -// PRWorkload::apply(WorkListItem& wl) -// { -// if (applyCondition(wl)) { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float delta = (temp_float - prop_float) / wl.degree; -// uint32_t delta_uint = readFromFloat(delta); -// wl.prop = wl.tempProp; -// return std::make_tuple(delta_uint, true, true); -// } -// return std::make_tuple(0, false, false); -// } - -// std::string -// PRWorkload::printWorkListItem(const WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// return csprintf( -// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", -// temp_float, temp_float, wl.degree, wl.edgeIndex -// ); -// } +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + items[i].tempProp = readFromFloat(0); + items[i].prop = readFromFloat(1 - alpha); + atom_active |= activeCondition(items[i]); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { 
+ weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float dist = std::abs(temp_float - prop_float); + return dist >= threshold; +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + uint32_t delta_uint = readFromFloat(delta); + wl.prop = wl.tempProp; + return delta_uint; +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", + temp_float, prop_float, wl.degree, wl.edgeIndex); +} } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index f71955bd16..f335ad9b47 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -77,24 +77,26 @@ class BFSWorkload : public GraphWorkload }; -// class PRWorkload : public GraphWorkload -// { -// private: -// float alpha; -// float threshold; - -// public: -// PRWorkload(float alpha, float threshold); - -// ~PRWorkload() {} - -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr 
pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 0c21833a05..09a997696d 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createBFSWorkload"), - # PyBindMethod("createPRWorkload"), + PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 76e7d262e8..c2393c2f1e 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -39,6 +39,7 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") active_buffer_size = Param.Int("Maximum number of memory active memory " "atoms ready to send updates. 
This parameter " "and post_push_wb_queue_size should be set " diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 883992e64e..60c78559e4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -110,11 +110,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -// void -// CenteralController::createPRWorkload(float alpha, float threshold) -// { -// workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); -// } +void +CenteralController::createPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} void CenteralController::recvDoneSignal() diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 6eb07dbcac..ae2980d050 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -63,7 +63,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createBFSWorkload(Addr init_addr, uint32_t init_value); - // void createPRWorkload(float alpha, float threshold); + void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 98229dde24..8ac40198be 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,10 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), - maxRespPerCycle(params.max_resp_per_cycle), - pullsReceived(0), pullsScheduled(0), pendingPullReads(0), - activeBufferSize(params.active_buffer_size), + onTheFlyReqs(0), 
maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), + pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -129,29 +129,17 @@ CoalesceEngine::done() } bool -CoalesceEngine::timeToPull() +CoalesceEngine::enoughSpace() { - return (activeBuffer.size() + pendingPullReads) < activeBufferSize; -} - -bool -CoalesceEngine::canSchedulePull() -{ - // TODO: Maybe a good idea to change this to - // activeBuffer.size() + pendingPullReads + pullsScheduled < activeBufferSize - return pullsScheduled < 1; -} - -bool -CoalesceEngine::workLeftInMem() -{ - return !directory->empty(); + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; } bool CoalesceEngine::pullCondition() { - return ((activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize); + bool enough_space = enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; } // addr should be aligned to peerMemoryAtomSize @@ -784,12 +772,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) atom_active |= graphWorkload->activeCondition( cacheBlocks[block_index].items[index]); } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); - } PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, @@ -797,8 +779,21 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; + if (atom_active) { + activeCacheBlocks.erase(block_index); + if 
(enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + memPort.sendPacket(pkt); + onTheFlyReqs++; + } cacheBlocks[block_index].reset(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c2da6a90cd..f605704b6d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,6 +118,7 @@ class CoalesceEngine : public BaseMemoryEngine UniqueFIFO activeCacheBlocks; int pullsScheduled; + int pendingPullLimit; int pendingPullReads; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. @@ -128,9 +129,7 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> activeBuffer; std::deque> postPushWBQueue; - bool timeToPull(); - bool canSchedulePull(); - bool workLeftInMem(); + bool enoughSpace(); bool pullCondition(); int getBlockIndex(Addr addr); From ccaa539854ee30fc4ea9e6289968ddcf9700edf1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 9 Nov 2022 21:24:39 -0800 Subject: [PATCH 219/247] Updating activeCondition for PR. 
--- src/accl/graph/base/graph_workload.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 0539296cce..05c8d05089 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -160,7 +160,7 @@ PRWorkload::activeCondition(WorkListItem wl) float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); float dist = std::abs(temp_float - prop_float); - return dist >= threshold; + return (dist >= threshold) && (wl.degree > 0); } uint32_t From 3747d9f40e7dd23a7e958621090b02ba58cd79c9 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Nov 2022 15:36:40 -0800 Subject: [PATCH 220/247] Adding SSSP and CC --- src/accl/graph/base/graph_workload.cc | 172 ++++++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 58 +++++++++ 2 files changed, 230 insertions(+) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 05c8d05089..e36c074da9 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -113,6 +113,121 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } +void +BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + 
+uint32_t +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) +{ + return 1; +} + +bool +BFSVisitedWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +BFSVisitedWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + +void +SSSPWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +SSSPWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +SSSPWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + weight; +} + +bool +SSSPWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +SSSPWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +SSSPWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + + void PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -184,4 +299,61 @@ PRWorkload::printWorkListItem(const WorkListItem wl) temp_float, prop_float, wl.degree, 
wl.edgeIndex); } +void +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + Addr pkt_addr = pkt->getAddr(); + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + items[i].tempProp = (int) ( pkt_addr / sizeof(WorkListItem)) + i; + items[i].prop = -1; + atom_active |= activeCondition(items[i]); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +CCWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +CCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +bool +CCWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +CCWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +CCWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index f335ad9b47..de2877d6e8 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -76,6 +76,48 @@ class BFSWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; +class BFSVisitedWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + BFSVisitedWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~BFSVisitedWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t 
reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class SSSPWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + SSSPWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~SSSPWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + class PRWorkload : public GraphWorkload { @@ -98,6 +140,22 @@ class PRWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; +class CCWorkload : public GraphWorkload +{ + + public: + CCWorkload() {} + + ~CCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + } #endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ From 000103e41a94e4baf407eca22e44c3aabb0fe972 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 11 Nov 2022 14:40:50 -0800 Subject: [PATCH 221/247] Adding option to use SimpleMemory for vertex memory. 
--- configs/accl/bfs.py | 17 ++- configs/accl/pr.py | 20 ++- configs/accl/real-graph-gen.py | 16 ++- configs/accl/sega.py | 34 ++--- .../accl/{sega-simple.py => sega_simple.py} | 133 ++++++++---------- 5 files changed, 113 insertions(+), 107 deletions(-) rename configs/accl/{sega-simple.py => sega_simple.py} (50%) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 829449c599..806aa8a915 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from sega import SEGA import m5 import argparse @@ -39,6 +38,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -64,6 +71,7 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.simple, args.sample, args.verify, ) @@ -76,10 +84,15 @@ def get_inputs(): graph, init_addr, init_value, + simple, sample, verify, ) = get_inputs() - + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index e852e47561..e3d7c764ad 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from sega import SEGA import m5 import argparse @@ -39,6 +38,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -64,10 +71,12 @@ def get_inputs(): args.graph, args.alpha, args.threshold, + args.simple, args.sample, args.verify, ) + if __name__ == "__m5_main__": ( num_gpts, @@ -75,10 +84,15 @@ def get_inputs(): graph, alpha, threshold, + simple, sample, verify, ) = get_inputs() - + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) @@ -95,7 +109,6 @@ def get_inputs(): ) m5.stats.dump() m5.stats.reset() - print(exit_event.getCause()) if exit_event.getCause() != "simulate() limit reached": break else: @@ -106,3 +119,4 @@ def get_inputs(): ) if verify: system.print_answer() + diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py index b943a925c1..332bb67452 100644 --- a/configs/accl/real-graph-gen.py +++ b/configs/accl/real-graph-gen.py @@ -45,8 +45,11 @@ def get_inputs(): if __name__ == "__main__": graph_path, num_gpts = get_inputs() + graph_sorter = os.environ.get("GRAPH_SORTER") graph_reader = os.environ.get("GRAPH_READER") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER.") if graph_reader is None: raise ValueError(f"No value for $GRAPH_READER.") @@ -54,6 +57,17 @@ def get_inputs(): raise ValueError(f"{graph_path} does not exist.") graph_dir = os.path.dirname(graph_path) + sorted_graph = f"{graph_dir}/sorted_graph.txt" + if not os.path.exists(sorted_graph): + print(f"Sorting {graph_path} into {sorted_graph}.") + subprocess.run( + [ + "python", + f"{graph_sorter}", + f"{graph_path}", + 
f"{sorted_graph}", + ] + ) if not "binaries" in os.listdir(graph_dir): print(f"binaries directory not found in {graph_dir}") os.mkdir(f"{graph_dir}/binaries") @@ -80,7 +94,7 @@ def get_inputs(): subprocess.run( [ f"{graph_reader}", - f"{graph_path}", + f"{sorted_graph}", "false", f"{num_gpts}", "32", diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7831302228..1ea36ea49e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,11 +48,9 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__( - self, edge_memory_size: str, cache_size: str, simple_mem: bool = False - ): + self, edge_memory_size: str, cache_size: str): super().__init__() - self._simple_mem = simple_mem - self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, @@ -69,20 +67,14 @@ def __init__( update_queue_size=32, ) - if self._simple_mem: - self.vertex_mem_ctrl = SimpleMemory( - latency="122ns", latency_var="0ns", bandwidth="28GiB/s" - ) - else: - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), - dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) - ) + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), + dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) + ) self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False - ) + range=AddrRange(edge_memory_size), in_addr_map=False) ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -106,17 +98,11 @@ def setReqPort(self, port): self.push_engine.out_ports = port def 
set_vertex_range(self, vertex_ranges): - if self._simple_mem: - self.vertex_mem_ctrl.range = vertex_ranges[0] - else: - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] def set_vertex_pch_bit(self, pch_bit): - if self._simple_mem: - pass - else: - self.vertex_mem_ctrl.pch_bit = pch_bit + self.vertex_mem_ctrl.pch_bit = pch_bit def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image diff --git a/configs/accl/sega-simple.py b/configs/accl/sega_simple.py similarity index 50% rename from configs/accl/sega-simple.py rename to configs/accl/sega_simple.py index 7ec19c92ae..f59fa71a79 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega_simple.py @@ -24,90 +24,87 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import m5 -import argparse - from math import log from m5.objects import * + def interleave_addresses(plain_range, num_channels, cache_line_size): intlv_low_bit = log(cache_line_size, 2) intlv_bits = log(num_channels, 2) ret = [] for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) return ret + class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): + def __init__( + self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=32, + active_buffer_size=64, + post_push_wb_queue_size=64, + ) self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=32, - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="0ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") + + self.edge_mem_ctrl = MemCtrl( + 
dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), in_addr_map=False) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) def getRespPort(self): return self.wl_engine.in_ports + def setRespPort(self, port): self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports + def setReqPort(self, port): self.push_engine.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image + self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" @@ -115,14 +112,12 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), num_mpus, 32 + ) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) + gpt = GPT("4GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) @@ -134,32 +129,16 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - 
argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + def create_pop_count_directory(self, atoms_per_block): + for gpt in self.gpts: + gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) - m5.instantiate() + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) - system.create_initial_bfs_update(init_addr, init_value) + def print_answer(self): + self.ctrl.printAnswerToHostSimout() - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") From 4b30d61b3a7b5261973467c478d2243da896d83b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 12:55:54 -0800 Subject: [PATCH 222/247] Removing graph gen scripts and moved to sega-utils. --- configs/accl/real-graph-gen.py | 107 ------------------------ configs/accl/synth-graph-gen.py | 139 -------------------------------- 2 files changed, 246 deletions(-) delete mode 100644 configs/accl/real-graph-gen.py delete mode 100644 configs/accl/synth-graph-gen.py diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py deleted file mode 100644 index 332bb67452..0000000000 --- a/configs/accl/real-graph-gen.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import os -import argparse -import subprocess - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("path", type=str, help="Path to the graph file.") - argparser.add_argument( - "num_gpts", - type=int, - help="Number gpts to create synth graph binaries for.", - ) - - args = argparser.parse_args() - return args.path, args.num_gpts - - -if __name__ == "__main__": - graph_path, num_gpts = get_inputs() - - graph_sorter = os.environ.get("GRAPH_SORTER") - graph_reader = os.environ.get("GRAPH_READER") - - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER.") - if graph_reader is None: - raise ValueError(f"No value for $GRAPH_READER.") - - if not os.path.exists(graph_path): - raise ValueError(f"{graph_path} does not exist.") - - graph_dir = os.path.dirname(graph_path) - sorted_graph = f"{graph_dir}/sorted_graph.txt" - if not os.path.exists(sorted_graph): - print(f"Sorting {graph_path} into {sorted_graph}.") - subprocess.run( - [ - "python", - f"{graph_sorter}", - f"{graph_path}", - f"{sorted_graph}", - ] - ) - if not "binaries" in os.listdir(graph_dir): - print(f"binaries directory not found in {graph_dir}") - os.mkdir(f"{graph_dir}/binaries") - print(f"Created {graph_dir}/binaries") - - if not f"gpts_{num_gpts}" in os.listdir(f"{graph_dir}/binaries"): - print(f"gpts_{num_gpts} not found in {graph_dir}/binaries") - os.mkdir(f"{graph_dir}/binaries/gpts_{num_gpts}") - print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all( - [ - binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") - for binary in expected_bins - ] - ): - print( - f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" - ) - for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): - os.remove(delete.path) - print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") - subprocess.run( - [ - f"{graph_reader}", - 
f"{sorted_graph}", - "false", - f"{num_gpts}", - "32", - f"{graph_dir}/binaries/gpts_{num_gpts}", - ] - ) - print( - f"Created the graph binaries in " - f"{graph_dir}/binaries/gpts_{num_gpts}" - ) diff --git a/configs/accl/synth-graph-gen.py b/configs/accl/synth-graph-gen.py deleted file mode 100644 index 15e4a6eff2..0000000000 --- a/configs/accl/synth-graph-gen.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import os -import argparse -import subprocess - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument( - "scale", type=int, help="The scale of the synth graph to generate." - ) - argparser.add_argument( - "deg", - type=int, - help="The average degree of the synth graph to generate.", - ) - argparser.add_argument( - "num_gpts", - type=int, - help="Number gpts to create synth graph binaries for.", - ) - - args = argparser.parse_args() - return args.scale, args.deg, args.num_gpts - - -if __name__ == "__main__": - scale, deg, num_gpts = get_inputs() - - base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.environ.get("GRAPH_GEN") - graph_reader = os.environ.get("GRAPH_READER") - graph_sorter = os.environ.get("GRAPH_SORTER") - if graph_gen is None: - raise ValueError(f"No value for $GRAPH_GEN.") - if graph_reader is None: - raise ValueError(f"No value for $GRAPH_READER.") - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER") - - graph_path = os.path.join(base_dir, f"graph_{scale}_{deg}") - if not os.path.exists(graph_path): - print(f"{graph_path} does not exist already.") - os.mkdir(graph_path) - print(f"Created {graph_path}") - - if not "graph.txt" in os.listdir(graph_path): - print(f"graph.txt not found in {graph_path}") - for delete in os.scandir(graph_path): - os.remove(delete.path) - print(f"Deleted everything in {graph_path}") - subprocess.run( - [ - f"{graph_gen}", - f"{scale}", - f"{deg}", - f"{graph_path}/graph_unordered.txt", - ] - ) - print(f"Generated a graph with scale " f"{scale} and deg {deg}") - subprocess.run( - [ - "python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt", - ] - ) - print( - f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt" - ) - subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) - print(f"Deleted {graph_path}/graph_unordered.txt") - - if not "binaries" in 
os.listdir(graph_path): - print(f"binaries directory not found in {graph_path}") - os.mkdir(f"{graph_path}/binaries") - print(f"Created {graph_path}/binaries") - - if not f"gpts_{num_gpts}" in os.listdir(f"{graph_path}/binaries"): - print(f"gpts_{num_gpts} not found in {graph_path}/binaries") - os.mkdir(f"{graph_path}/binaries/gpts_{num_gpts}") - print(f"Created {graph_path}/binaries/gpts_{num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all( - [ - binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") - for binary in expected_bins - ] - ): - print( - f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" - ) - for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): - os.remove(delete.path) - print( - f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}" - ) - subprocess.run( - [ - f"{graph_reader}", - f"{graph_path}/graph.txt", - "false", - f"{num_gpts}", - "32", - f"{graph_path}/binaries/gpts_{num_gpts}", - ] - ) - print( - f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{num_gpts}" - ) From e10ce6142d0a7e255121d14a2eefe2715756bc1c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 12:57:19 -0800 Subject: [PATCH 223/247] Adding BSP mode. 
--- src/accl/graph/base/data_structs.hh | 30 ++- src/accl/graph/base/graph_workload.hh | 2 +- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/centeral_controller.cc | 66 ++++-- src/accl/graph/sega/centeral_controller.hh | 10 +- src/accl/graph/sega/coalesce_engine.cc | 257 ++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 17 +- src/accl/graph/sega/enums.cc | 15 +- src/accl/graph/sega/enums.hh | 18 ++ src/accl/graph/sega/mpu.hh | 4 + 11 files changed, 308 insertions(+), 117 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 84233ae39c..f09a0dd167 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -43,28 +43,34 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; - uint32_t degree : 32; uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeFuture: 1; std::string to_string() { return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " - "degree: %u}", tempProp, prop, edgeIndex, degree); + "degree: %u, activeNow: %s, activeFuture: %s}", + tempProp, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeFuture ? 
"true" : "false"); } WorkListItem(): tempProp(0), prop(0), + edgeIndex(0), degree(0), - edgeIndex(0) + activeNow(false), + activeFuture(false) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t degree, uint32_t edge_index): - tempProp(temp_prop), - prop(prop), - degree(degree), - edgeIndex(edge_index) + uint32_t degree, uint32_t edge_index, + bool active_now, bool active_future): + tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree), + activeNow(active_now), activeFuture(active_future) {} }; @@ -158,6 +164,10 @@ class UniqueFIFO return fifo.size(); } + void clear() { + fifo.clear(); + } + bool empty() { return fifo.empty(); } @@ -174,6 +184,10 @@ class UniqueFIFO assert(it != fifo.end()); fifo.erase(it); } + + void operator=(const UniqueFIFO& rhs) { + fifo = rhs.fifo; + } }; } diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index de2877d6e8..14a6561ae3 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,7 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; - virtual bool activeCondition(WorkListItem wl) = 0; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 09a997696d..8b43c90102 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -42,6 +42,9 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") cxx_exports = [ + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), PyBindMethod("createPRWorkload"), 
PyBindMethod("printAnswerToHostSimout") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index c2393c2f1e..25f8a1c58b 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,7 +27,6 @@ from m5.params import * from m5.proxy import * -from m5.util.pybind import PyBindMethod from m5.objects.BaseMemoryEngine import BaseMemoryEngine class CoalesceEngine(BaseMemoryEngine): @@ -48,5 +47,3 @@ class CoalesceEngine(BaseMemoryEngine): "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") - - cxx_exports = [PyBindMethod("createPopCountDirectory")] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 60c78559e4..6c924a4703 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -42,7 +42,9 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), - system(params.system) + system(params.system), + mode(ProcessingMode::NOT_SET), + state(BulkSynchronousState::NOT_SET) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -50,11 +52,41 @@ CenteralController::CenteralController(const Params& params): } } +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + +void +CenteralController::createPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} + +void +CenteralController::createPopCountDirectory(int atoms_per_block) +{ + fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " + "mode by calling either setAsyncMode or setBSPMode.") + if (mode == ProcessingMode::ASYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } + if (mode == 
ProcessingMode::BULK_SYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createBSPPopCountDirectory(atoms_per_block); + } + } +} + void CenteralController::startup() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); + mpu->setProcessingMode(mode); mpu->recvWorkload(workload); } @@ -83,7 +115,7 @@ CenteralController::startup() for (auto mpu: mpuVector) { mpu->postMemInitSetup(); - if (!mpu->running() && (mpu->workCount()> 0)) { + if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } @@ -104,18 +136,6 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) return pkt; } -void -CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) -{ - workload = new BFSWorkload(init_addr, init_value); -} - -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold); -} - void CenteralController::recvDoneSignal() { @@ -124,9 +144,25 @@ CenteralController::recvDoneSignal() done &= mpu->done(); } - if (done) { + if (done && mode == ProcessingMode::ASYNCHRONOUS) { exitSimLoopNow("no update left to process."); } + + if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { + assert(state != BulkSynchronousState::DONT_CARE); + if (state == BulkSynchronousState::APPLYING) { + // TODO: + // 1- Toggle directories + // 2- Check if termination condition is met + // 3- If yes, schedule exit event, + // 4- If not switch state to consuming. 
+ exitSimLoopNow("applying done."); + } else if (state == BulkSynchronousState::CONSUMING) { + // TODO: + // Schedule Bulk apply + exitSimLoopNow("consuming done."); + } + } } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ae2980d050..ab0e0c0c09 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/enums.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "params/CenteralController.hh" @@ -46,9 +47,11 @@ class CenteralController : public ClockedObject { private: System* system; - Addr maxVertexAddr; + ProcessingMode mode; + BulkSynchronousState state; + std::vector mpuVector; std::unordered_map addrRangeListMap; @@ -62,6 +65,11 @@ class CenteralController : public ClockedObject CenteralController(const CenteralControllerParams ¶ms); virtual void startup() override; + void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } + void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } + + void createPopCountDirectory(int atoms_per_block); + void createBFSWorkload(Addr init_addr, uint32_t init_value); void createPRWorkload(float alpha, float threshold); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8ac40198be..bfe3fe21b8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,7 +34,6 @@ #include "base/intmath.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" -#include "debug/MSDebug.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -43,7 +42,7 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), lastAtomAddr(0), + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), 
lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), @@ -77,6 +76,8 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } + +// NOTE: Used for initializing memory and reading the final answer void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -85,10 +86,6 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); - // FIXME: Check postPushWBQueue for hits - // Is it really the case though. I don't think at this time - // beacuse we check done after handleMemResp and make sure all - // the writes to memory are done before scheduling an exit event if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -100,7 +97,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - graphWorkload->init(pkt, directory); + graphWorkload->init(pkt, currentDirectory); if (pkt->getAddr() > lastAtomAddr) { lastAtomAddr = pkt->getAddr(); } @@ -111,21 +108,46 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) void CoalesceEngine::postMemInitSetup() { - directory->setLastAtomAddr(lastAtomAddr); + currentDirectory->setLastAtomAddr(lastAtomAddr); } void -CoalesceEngine::createPopCountDirectory(int atoms_per_block) +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) { - directory = new PopCountDirectory( + currentDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectroy( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void 
+CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); } bool CoalesceEngine::done() { - return memoryFunctionQueue.empty() && activeCacheBlocks.empty() && - activeBuffer.empty() && directory->empty() && (onTheFlyReqs == 0); + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); } bool @@ -249,16 +271,21 @@ CoalesceEngine::recvWLRead(Addr addr) // NOTE: The cache block could still be active but // not dirty. If active we only have to active tracking // but can throw the data away. - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -360,16 +387,21 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // Since it is going to the cache, cache will be responsible for // tracking this. Push to activeCacheBlocks for simulator speed // instead of having to search for active blocks in the cache. - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_inde].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); } - if (atom_active) { - int count = directory->deactivate(addr); - activeCacheBlocks.push_back(block_index); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_future) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); } assert(MSHR.find(block_index) != MSHR.end()); @@ -420,15 +452,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem items[numElementsPerLine]; pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition(items[index]); + atom_active |= items[index].activeNow; } - if (atom_active) { - int count = directory->deactivate(addr); + if (atom_active_now) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = currentDirectory->deactivate(addr); activeBuffer.emplace_back(pkt, curTick()); - 
stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); } else { delete pkt; } @@ -486,6 +519,9 @@ CoalesceEngine::processNextResponseEvent() stats.responseQueueLatency.sample( waiting_ticks * 1e9 / getClockFrequency()); if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. if (!responseQueue.empty()) { stats.responsePortShortage++; } @@ -533,12 +569,22 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { cacheBlocks[block_index].dirty |= true; } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; - if ((graphWorkload->activeCondition(cacheBlocks[block_index].items[wl_offset])) && - (!activeCacheBlocks.find(block_index))) { - activeCacheBlocks.push_back(block_index); - if (!owner->running()) { - owner->start(); + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); } } @@ -565,16 +611,22 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) schedule(nextMemoryEvent, nextCycle()); } } else { - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - 
cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_now) { + // TODO: Sample frontier size and blockCount here. + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); } cacheBlocks[block_index].reset(); } @@ -586,6 +638,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); stats.numVertexWrites++; + if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { owner->recvDoneSignal(); } @@ -623,6 +676,8 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } + // FIXME: done() might have a different meaning depending on + // ProcessingMode and Processing state if (done()) { owner->recvDoneSignal(); } @@ -659,6 +714,16 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // the it is definitely currently not active. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } need_send_pkt = false; wb = postPushWBQueue.erase(wb); @@ -677,7 +742,19 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; cacheBlocks[block_index].lastChangedTick = curTick(); - activeCacheBlocks.push_back(block_index); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } need_send_pkt = false; ab = activeBuffer.erase(ab); @@ -767,10 +844,11 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) // NOTE: If the atom we're writing back is active, we have to // stop tracking it in the cache and start tracking it in the memory. 
- bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } PacketPtr pkt = createWritePacket( @@ -779,18 +857,25 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - if (atom_active) { - activeCacheBlocks.erase(block_index); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); if (enoughSpace()) { activeBuffer.emplace_back(pkt, curTick()); } else { - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); onTheFlyReqs++; } } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + } memPort.sendPacket(pkt); onTheFlyReqs++; } @@ -810,17 +895,24 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) void CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) { - if (postPushWBQueue.empty()) { - return; - } - - PacketPtr wb_pkt; - Tick pkt_tick; - std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); - if (schedule_tick == pkt_tick) { - memPort.sendPacket(wb_pkt); - onTheFlyReqs++; - postPushWBQueue.pop_front(); + if (!postPushWBQueue.empty()) 
{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } } } @@ -828,8 +920,8 @@ void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { pullsScheduled--; - if (!directory->empty()) { - Addr addr = directory->getNextWork(); + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); int block_index = getBlockIndex(addr); bool in_cache = cacheBlocks[block_index].addr == addr; @@ -875,8 +967,7 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return activeCacheBlocks.size() + - directory->workCount() + activeBuffer.size(); + return activeCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); } void @@ -905,9 +996,10 @@ CoalesceEngine::processNextApplyEvent() pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { - if (graphWorkload->activeCondition(items[index])) { + if (items[index].activeNow) { Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); pullsReceived--; @@ -919,12 +1011,12 @@ CoalesceEngine::processNextApplyEvent() pkt->allocate(); pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= 
graphWorkload->activeCondition(items[index]); + atom_active_now |= items[index].activeNow; } // NOTE: If the atom is not active anymore. - if (!atom_active) { + if (!atom_active_now) { PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); @@ -946,9 +1038,10 @@ CoalesceEngine::processNextApplyEvent() int block_index = activeCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { - if (graphWorkload->activeCondition(cacheBlocks[block_index].items[index])) { + if (cacheBlocks[block_index].items[index].activeNow) { Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; cacheBlocks[block_index].dirty = true; owner->recvVertexPush(addr, delta, cacheBlocks[block_index].items[index].edgeIndex, @@ -959,20 +1052,20 @@ CoalesceEngine::processNextApplyEvent() } } - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition(cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; } // NOTE: If we have reached the last item in the cache block - if (!atom_active) { - activeCacheBlocks.erase(block_index); + if (!atom_active_now) { + currentActiveCacheBlocks.erase(block_index); } break; } // NOTE: If the block with index at the front of activeCacheBlocks // is not in IDLE state, then roll the that index to the back - activeCacheBlocks.pop_front(); - activeCacheBlocks.push_back(block_index); + currentActiveCacheBlocks.pop_front(); + currentActiveCacheBlocks.push_back(block_index); // NOTE: If we have visited all the items initially in the FIFO. 
num_visited_indices++; if (num_visited_indices == initial_fifo_length) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f605704b6d..39f2491232 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -96,7 +96,9 @@ class CoalesceEngine : public BaseMemoryEngine }; MPU* owner; - WorkDirectory* directory; + ProcessingMode mode; + WorkDirectory* currentDirectory; + WorkDirectory* futureDirectory; GraphWorkload* graphWorkload; Addr lastAtomAddr; @@ -114,8 +116,9 @@ class CoalesceEngine : public BaseMemoryEngine // Tracking work in cache int pullsReceived; - // NOTE: Remember to erase from this upon eviction from cache - UniqueFIFO activeCacheBlocks; + // NOTE: Remember to erase from these upon eviction from cache + UniqueFIFO currentActiveCacheBlocks; + UniqueFIFO futureActiveCacheBlocks; int pullsScheduled; int pendingPullLimit; @@ -195,12 +198,14 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + void setProcessingMode(ProcessingMode _mode) { mode = _mode; } + void createAsyncPopCountDirectory(int atoms_per_block); + void createBSPPopCountDirectory(int atoms_per_block); void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); void postMemInitSetup(); - - void createPopCountDirectory(int atoms_per_block); + void swapDirectories(); ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index de5d569c18..83f3033427 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -39,7 +39,6 @@ const char* cacheStateStrings[NUM_CACHE_STATE] = { "PENDING_WB" }; - const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = { "ACCEPT", @@ -53,4 +52,18 @@ const char* 
readDestinationStrings[NUM_READ_DESTINATION] = "READ_FOR_PUSH" }; +const char* processingModeStrings[NUM_PROCESSING_MODE] = +{ + "NOT_SET", + "ASYNCHRONOUS", + "BULK_SYNCHRONOUS" +}; + +const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = +{ + "NOT_SET", + "CONSUMING", + "APPLYING" +}; + } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 6153386b71..f6d199bf7d 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -60,6 +60,24 @@ enum ReadDestination }; extern const char* readDestinationStrings[NUM_READ_DESTINATION]; +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +enum BulkSynchronousStates +{ + NOT_SET, + CONSUMING, + APPLYING, + NUM_BULK_SYNCHRONOUS_STATE, +} +extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; + } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index ad18a0d5a5..358394ffc5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -63,6 +63,10 @@ class MPU : public SimObject MPU(const Params& params); void registerCenteralController(CenteralController* centeral_controller); + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalseceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalseceEngine->createBSPPopCountDirectory(atoms_per_block); } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } From 454e1e3a81c2818ea532183335fd94e731899326 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 
13:12:57 -0800 Subject: [PATCH 224/247] Fixing enums --- src/accl/graph/sega/centeral_controller.cc | 5 ++++- src/accl/graph/sega/enums.hh | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6c924a4703..6e5f3ffcec 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -113,6 +113,9 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + state = BulkSynchronousStates::CONSUMING; + } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -149,7 +152,7 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::DONT_CARE); + assert(state != BulkSynchronousState::NOT_SET); if (state == BulkSynchronousState::APPLYING) { // TODO: // 1- Toggle directories diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index f6d199bf7d..8280f122c3 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -75,7 +75,7 @@ enum BulkSynchronousStates CONSUMING, APPLYING, NUM_BULK_SYNCHRONOUS_STATE, -} +}; extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; } // namespace gem5 From f4b8685a29d80717374c2d222bfc96e5cec25266 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:15:52 -0800 Subject: [PATCH 225/247] Further fixes for enums. 
--- src/accl/graph/sega/centeral_controller.cc | 4 ++-- src/accl/graph/sega/enums.cc | 2 +- src/accl/graph/sega/enums.hh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6e5f3ffcec..c6b9cf7a52 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -44,7 +44,7 @@ CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), mode(ProcessingMode::NOT_SET), - state(BulkSynchronousState::NOT_SET) + state(BulkSynchronousState::DONT_CARE) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -152,7 +152,7 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::NOT_SET); + assert(state != BulkSynchronousState::DONT_CARE); if (state == BulkSynchronousState::APPLYING) { // TODO: // 1- Toggle directories diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 83f3033427..099594e9eb 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -61,7 +61,7 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = { - "NOT_SET", + "DONT_CARE", "CONSUMING", "APPLYING" }; diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 8280f122c3..4c94412c9b 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -71,7 +71,7 @@ extern const char* processingModeStrings[NUM_PROCESSING_MODE]; enum BulkSynchronousStates { - NOT_SET, + DONT_CARE, CONSUMING, APPLYING, NUM_BULK_SYNCHRONOUS_STATE, From c3fd13291d5a4ecf5e43713888a4de11769b05a4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:21:12 -0800 Subject: [PATCH 226/247] Fixing typos --- src/accl/graph/sega/enums.hh | 2 +- src/accl/graph/sega/mpu.hh | 4 ++-- 2 
files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 4c94412c9b..969ee8a976 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -69,7 +69,7 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum BulkSynchronousStates +enum BulkSynchronousState { DONT_CARE, CONSUMING, diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 358394ffc5..7d75e3e0b7 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -64,8 +64,8 @@ class MPU : public SimObject void registerCenteralController(CenteralController* centeral_controller); void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } - void createAsyncPopCountDirectory(int atoms_per_block) { coalseceEngine->createAsyncPopCountDirectory(atoms_per_block); } - void createBSPPopCountDirectory(int atoms_per_block) { coalseceEngine->createBSPPopCountDirectory(atoms_per_block); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } From 513e3f6beb77eb97902f9c0eafd5791b4dc9dcff Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:31:55 -0800 Subject: [PATCH 227/247] Fixing typos. 
--- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c6b9cf7a52..df1abbedc3 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -114,7 +114,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - state = BulkSynchronousStates::CONSUMING; + state = BulkSynchronousState::CONSUMING; } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index bfe3fe21b8..6efafbb76c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -124,7 +124,7 @@ CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) { currentDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); - futureDirectory = new PopCountDirectroy( + futureDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); } @@ -390,7 +390,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool atom_active_now = false; bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active_now |= cacheBlocks[block_inde].items[index].activeNow; + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { @@ -453,12 +453,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem items[numElementsPerLine]; pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= 
items[index].activeNow; + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; } if (atom_active_now) { // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); + if (atom_active_future) { + int count_2 = futureDirectory->deactivate(addr); + } activeBuffer.emplace_back(pkt, curTick()); // stats.blockActiveCount.sample(count); // stats.frontierSize.sample(directory->workCount()); @@ -573,7 +578,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; if (mode == ProcessingMode::ASYNCHRONOUS) { - cacheBlocks[block_index].activeNow |= active; + cacheBlocks[block_index].items[wl_offset].activeNow |= active; if (active && (!currentActiveCacheBlocks.find(block_index))) { currentActiveCacheBlocks.push_back(block_index); if (!owner->running()) { @@ -582,7 +587,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } } if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - cacheBlocks[block_index].activeFuture |= active; + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; if (active && (!futureActiveCacheBlocks.find(block_index))) { futureActiveCacheBlocks.push_back(block_index); } @@ -903,7 +908,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) WorkListItem items[numElementsPerLine]; wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); bool atom_active_future = false; - for (int index = 0; index < numElementPerLine; index++) { + for (int index = 0; index < numElementsPerLine; index++) { atom_active_future |= items[index].activeFuture; } if (atom_active_future) { @@ -967,7 +972,7 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return activeCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); + return currentActiveCacheBlocks.size() + 
currentDirectory->workCount() + activeBuffer.size(); } void @@ -1031,7 +1036,7 @@ CoalesceEngine::processNextApplyEvent() } delete pkt; } - } else if (!activeCacheBlocks.empty()) { + } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; int initial_fifo_length = activeCacheBlocks.size(); while (true) { From d9ae6bed35e40240d7f6c80eb4c37b816099885d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:41:42 -0800 Subject: [PATCH 228/247] Fixing typos. --- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index df1abbedc3..db0f7941ed 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -68,7 +68,7 @@ void CenteralController::createPopCountDirectory(int atoms_per_block) { fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " - "mode by calling either setAsyncMode or setBSPMode.") + "mode by calling either setAsyncMode or setBSPMode."); if (mode == ProcessingMode::ASYNCHRONOUS) { for (auto mpu: mpuVector) { mpu->createAsyncPopCountDirectory(atoms_per_block); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6efafbb76c..e3c194566a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1038,9 +1038,9 @@ CoalesceEngine::processNextApplyEvent() } } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; - int initial_fifo_length = activeCacheBlocks.size(); + int initial_fifo_length = crrentActiveCacheBlocks.size(); while (true) { - int block_index = activeCacheBlocks.front(); + int block_index = currentActiveCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 
0); index++) { if (cacheBlocks[block_index].items[index].activeNow) { From 37ec3ddacd9e25127f5ee90a7341956549bc731d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:54:47 -0800 Subject: [PATCH 229/247] Debug. --- src/accl/graph/base/graph_workload.cc | 74 +++++++++++++++++++++- src/accl/graph/base/graph_workload.hh | 36 +++++------ src/accl/graph/sega/centeral_controller.cc | 10 +-- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- 5 files changed, 97 insertions(+), 27 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index e36c074da9..a78b3c1526 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -92,9 +92,9 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) } bool -BFSWorkload::activeCondition(WorkListItem wl) +BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - return (wl.tempProp < wl.prop) && (wl.degree > 0); + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree; > 0); } uint32_t @@ -298,6 +298,76 @@ PRWorkload::printWorkListItem(const WorkListItem wl) "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", temp_float, prop_float, wl.degree, wl.edgeIndex); } +// void +// PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +// { +// size_t pkt_size = pkt->getSize(); +// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); +// WorkListItem items[num_elements]; + +// pkt->writeDataToBlock((uint8_t*) items, pkt_size); +// bool atom_active = false; +// for (int i = 0; i < num_elements; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - alpha); +// atom_active |= activeCondition(items[i]); +// } +// if (atom_active) { +// dir->activate(pkt->getAddr()); +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, pkt_size); +// } + +// uint32_t +// PRWorkload::reduce(uint32_t 
update, uint32_t value) +// { +// float update_float = writeToFloat(update); +// float value_float = writeToFloat(value); +// return readFromFloat(update_float + value_float); +// } + +// uint32_t +// PRWorkload::propagate(uint32_t value, uint32_t weight) +// { +// float value_float = writeToFloat(value); +// float weight_float = writeToFloat(weight); +// if (weight == 0) { +// weight_float = 1.0; +// } +// return readFromFloat(alpha * value_float * weight_float); +// } + +// bool +// PRWorkload::activeCondition(WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float dist = std::abs(temp_float - prop_float); +// return (dist >= threshold) && (wl.degree > 0); +// } + +// uint32_t +// PRWorkload::apply(WorkListItem& wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float delta = (temp_float - prop_float) / wl.degree; +// uint32_t delta_uint = readFromFloat(delta); +// wl.prop = wl.tempProp; +// return delta_uint; +// } + +// std::string +// PRWorkload::printWorkListItem(const WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// return csprintf( +// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", +// temp_float, prop_float, wl.degree, wl.edgeIndex); +// } void CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 14a6561ae3..8e27d16bf9 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -72,7 +72,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem 
old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -119,26 +119,26 @@ class SSSPWorkload : public GraphWorkload }; -class PRWorkload : public GraphWorkload -{ - private: - float alpha; - float threshold; +// class PRWorkload : public GraphWorkload +// { +// private: +// float alpha; +// float threshold; - public: - PRWorkload(float alpha, float threshold): - alpha(alpha), threshold(threshold) - {} +// public: +// PRWorkload(float alpha, float threshold): +// alpha(alpha), threshold(threshold) +// {} - ~PRWorkload() {} +// ~PRWorkload() {} - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; class CCWorkload : public GraphWorkload { diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index db0f7941ed..7de6f61b56 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -58,11 +58,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold); -} +// void +// CenteralController::createPRWorkload(float alpha, float threshold) +// { +// workload = new PRWorkload(alpha, threshold); +// } void 
CenteralController::createPopCountDirectory(int atoms_per_block) diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ab0e0c0c09..b32dc38385 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -71,7 +71,7 @@ class CenteralController : public ClockedObject void createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createPRWorkload(float alpha, float threshold); + // void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e3c194566a..6b44f7395b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1038,7 +1038,7 @@ CoalesceEngine::processNextApplyEvent() } } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; - int initial_fifo_length = crrentActiveCacheBlocks.size(); + int initial_fifo_length = currentActiveCacheBlocks.size(); while (true) { int block_index = currentActiveCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { From 4abd1cd5ec0e131cd56a741395e7ffe1bcdb2dd0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:06:44 -0800 Subject: [PATCH 230/247] Debugging. 
--- src/accl/graph/base/graph_workload.cc | 8 +++++--- src/accl/graph/sega/CenteralController.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index a78b3c1526..50024965a1 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -67,12 +67,14 @@ BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem items[num_elements]; pkt->writeDataToBlock((uint8_t*) items, pkt_size); - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = initValue; + if (activeCondition(new_wl, items[index])) { dir->activate(aligned_addr); } + items[index] = new_wl; + pkt->deleteData(); pkt->allocate(); pkt->setDataFromBlock((uint8_t*) items, pkt_size); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 8b43c90102..6de9e03a1c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,6 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), - PyBindMethod("createPRWorkload"), + # PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] From 32a0f813e93accd59bd0f8d70430d9d5972d6317 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:08:17 -0800 Subject: [PATCH 231/247] Typos. 
--- src/accl/graph/base/graph_workload.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 50024965a1..9c21a3932a 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -96,7 +96,7 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) bool BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree; > 0); + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0); } uint32_t From 1352e207854c3f38670358efa991967ecb0a3089 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:14:41 -0800 Subject: [PATCH 232/247] Debugging. --- src/accl/graph/base/graph_workload.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 9c21a3932a..8536c2bbd8 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -71,6 +71,7 @@ BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem new_wl = items[index]; new_wl.tempProp = initValue; if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; dir->activate(aligned_addr); } items[index] = new_wl; From f13057c8ad23d2c91203cf2ac151ce3cd54f4169 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 00:58:54 -0800 Subject: [PATCH 233/247] Finalizing bsp and pr. 
--- configs/accl/bfs.py | 3 +- configs/accl/pr.py | 28 +++-- configs/accl/sega.py | 22 ++-- configs/accl/sega_simple.py | 21 ++-- src/accl/graph/base/graph_workload.cc | 131 ++++++--------------- src/accl/graph/base/graph_workload.hh | 34 +++--- src/accl/graph/sega/CenteralController.py | 3 +- src/accl/graph/sega/centeral_controller.cc | 46 ++++---- src/accl/graph/sega/centeral_controller.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 63 ++++++++++ src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/enums.cc | 7 -- src/accl/graph/sega/enums.hh | 9 -- src/accl/graph/sega/mpu.hh | 2 + 14 files changed, 193 insertions(+), 182 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 806aa8a915..ab5de485b1 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -88,7 +88,7 @@ def get_inputs(): sample, verify, ) = get_inputs() - + if simple: from sega_simple import SEGA else: @@ -98,6 +98,7 @@ def get_inputs(): m5.instantiate() + system.set_async_mode() system.create_pop_count_directory(64) system.create_bfs_workload(init_addr, init_value) if sample: diff --git a/configs/accl/pr.py b/configs/accl/pr.py index e3d7c764ad..ea8a103640 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -35,9 +35,9 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) - argparser.add_argument("threshold", type=float) argparser.add_argument( "--simple", dest="simple", @@ -69,8 +69,8 @@ def get_inputs(): args.num_gpts, args.cache_size, args.graph, + args.iterations, args.alpha, - args.threshold, args.simple, args.sample, args.verify, @@ -82,13 +82,13 @@ def get_inputs(): num_gpts, cache_size, graph, + iterations, alpha, - threshold, simple, sample, verify, ) = get_inputs() - + if simple: from sega_simple import SEGA 
else: @@ -98,8 +98,9 @@ def get_inputs(): m5.instantiate() + system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha, threshold) + system.create_pr_workload(alpha) if sample: while True: exit_event = m5.simulate(100000000) @@ -112,11 +113,16 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") if verify: system.print_answer() - diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 1ea36ea49e..07e1b36d9d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -73,8 +73,8 @@ def __init__( ) self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False) + dram= + DDR4_2400_8x8(range=AddrRange(edge_memory_size), in_addr_map=False) ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -124,7 +124,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + gpt = GPT("16GiB", cache_size) gpt.set_vertex_range( [vertex_ranges[i], vertex_ranges[i + num_mpus]] ) @@ -139,15 +139,23 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + def create_pop_count_directory(self, atoms_per_block): - for gpt in self.gpts: - gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + 
self.ctrl.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index f59fa71a79..8727a4c90d 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -66,9 +66,9 @@ def __init__( max_propagates_per_cycle=8, update_queue_size=32, ) - + self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") - + self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( range=AddrRange(edge_memory_size), in_addr_map=False) @@ -129,16 +129,23 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + def create_pop_count_directory(self, atoms_per_block): - for gpt in self.gpts: - gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + self.ctrl.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) def print_answer(self): self.ctrl.printAnswerToHostSimout() - diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 8536c2bbd8..1fa2b287c4 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -111,9 +111,11 @@ std::string BFSWorkload::printWorkListItem(const WorkListItem wl) { return 
csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); + "WorkListItem{tempProp: %u, prop: %u, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); } void @@ -232,7 +234,7 @@ SSSPWorkload::printWorkListItem(const WorkListItem wl) void -PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { size_t pkt_size = pkt->getSize(); int num_elements = (int) (pkt_size / sizeof(WorkListItem)); @@ -241,9 +243,12 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) pkt->writeDataToBlock((uint8_t*) items, pkt_size); bool atom_active = false; for (int i = 0; i < num_elements; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - alpha); - atom_active |= activeCondition(items[i]); + WorkListItem new_wl = items[i]; + new_wl.tempProp = readFromFloat(1 - alpha); + new_wl.prop = readFromFloat(1); + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; } if (atom_active) { dir->activate(pkt->getAddr()); @@ -254,7 +259,7 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) } uint32_t -PRWorkload::reduce(uint32_t update, uint32_t value) +BSPPRWorkload::reduce(uint32_t update, uint32_t value) { float update_float = writeToFloat(update); float value_float = writeToFloat(value); @@ -262,115 +267,47 @@ PRWorkload::reduce(uint32_t update, uint32_t value) } uint32_t -PRWorkload::propagate(uint32_t value, uint32_t weight) +BSPPRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(weight); - if (weight == 0) { - weight_float = 1.0; - } - return readFromFloat(alpha * value_float * weight_float); + return readFromFloat(alpha * value_float); } bool 
-PRWorkload::activeCondition(WorkListItem wl) +BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float dist = std::abs(temp_float - prop_float); - return (dist >= threshold) && (wl.degree > 0); + return (old_wl.degree > 0); } uint32_t -PRWorkload::apply(WorkListItem& wl) +BSPPRWorkload::apply(WorkListItem& wl) { - float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); - float delta = (temp_float - prop_float) / wl.degree; + float delta = prop_float / wl.degree; uint32_t delta_uint = readFromFloat(delta); - wl.prop = wl.tempProp; return delta_uint; } +void +BSPPRWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + wl.tempProp = readFromFloat(1 - alpha); + wl.activeFuture = (wl.degree > 0); +} + std::string -PRWorkload::printWorkListItem(const WorkListItem wl) +BSPPRWorkload::printWorkListItem(const WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); return csprintf( - "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", - temp_float, prop_float, wl.degree, wl.edgeIndex); -} -// void -// PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) -// { -// size_t pkt_size = pkt->getSize(); -// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); -// WorkListItem items[num_elements]; - -// pkt->writeDataToBlock((uint8_t*) items, pkt_size); -// bool atom_active = false; -// for (int i = 0; i < num_elements; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - alpha); -// atom_active |= activeCondition(items[i]); -// } -// if (atom_active) { -// dir->activate(pkt->getAddr()); -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, pkt_size); -// } - -// uint32_t -// PRWorkload::reduce(uint32_t update, uint32_t value) -// { -// float update_float = 
writeToFloat(update); -// float value_float = writeToFloat(value); -// return readFromFloat(update_float + value_float); -// } - -// uint32_t -// PRWorkload::propagate(uint32_t value, uint32_t weight) -// { -// float value_float = writeToFloat(value); -// float weight_float = writeToFloat(weight); -// if (weight == 0) { -// weight_float = 1.0; -// } -// return readFromFloat(alpha * value_float * weight_float); -// } - -// bool -// PRWorkload::activeCondition(WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float dist = std::abs(temp_float - prop_float); -// return (dist >= threshold) && (wl.degree > 0); -// } - -// uint32_t -// PRWorkload::apply(WorkListItem& wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float delta = (temp_float - prop_float) / wl.degree; -// uint32_t delta_uint = readFromFloat(delta); -// wl.prop = wl.tempProp; -// return delta_uint; -// } - -// std::string -// PRWorkload::printWorkListItem(const WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// return csprintf( -// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", -// temp_float, prop_float, wl.degree, wl.edgeIndex); -// } + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} void CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 8e27d16bf9..fdd4928e10 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; @@ -72,6 +73,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -119,26 +121,24 @@ class SSSPWorkload : public GraphWorkload }; -// class PRWorkload : public GraphWorkload -// { -// private: -// float alpha; -// float threshold; +class BSPPRWorkload : public GraphWorkload +{ + private: + float alpha; -// public: -// PRWorkload(float alpha, float threshold): -// alpha(alpha), threshold(threshold) -// {} + public: + BSPPRWorkload(float alpha): alpha(alpha) {} -// ~PRWorkload() {} + ~BSPPRWorkload() {} -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t 
reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; class CCWorkload : public GraphWorkload { diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 6de9e03a1c..9dd8f41e61 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,7 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), - # PyBindMethod("createPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7de6f61b56..0103b1a0c4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -43,8 +43,7 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), - mode(ProcessingMode::NOT_SET), - state(BulkSynchronousState::DONT_CARE) + mode(ProcessingMode::NOT_SET) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -58,11 +57,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -// void -// CenteralController::createPRWorkload(float alpha, float threshold) -// { -// workload = new PRWorkload(alpha, threshold); -// } +void +CenteralController::createPRWorkload(float alpha) +{ + workload = new BSPPRWorkload(alpha); +} void CenteralController::createPopCountDirectory(int atoms_per_block) @@ -113,9 +112,6 @@ CenteralController::startup() 
panic_if(!image.write(proxy), "%s: Unable to write image."); - if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - state = BulkSynchronousState::CONSUMING; - } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -152,20 +148,25 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::DONT_CARE); - if (state == BulkSynchronousState::APPLYING) { - // TODO: - // 1- Toggle directories - // 2- Check if termination condition is met - // 3- If yes, schedule exit event, - // 4- If not switch state to consuming. - exitSimLoopNow("applying done."); - } else if (state == BulkSynchronousState::CONSUMING) { - // TODO: - // Schedule Bulk apply - exitSimLoopNow("consuming done."); + for (auto mpu: mpuVector) { + mpu->postConsumeProcess(); + mpu->swapDirectories(); + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } } + exitSimLoopNow("finished an iteration."); + } +} + +int +CenteralController::workCount() +{ + int work_count = 0; + for (auto mpu: mpuVector) { + work_count += mpu->workCount(); } + return work_count; } void @@ -184,7 +185,6 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - workload->apply(items[i]); std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index b32dc38385..ab039e5024 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -50,7 +50,6 @@ class CenteralController : public ClockedObject Addr maxVertexAddr; ProcessingMode mode; - BulkSynchronousState state; std::vector mpuVector; std::unordered_map addrRangeListMap; @@ -71,10 +70,11 @@ class CenteralController : public ClockedObject void 
createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); - // void createPRWorkload(float alpha, float threshold); + void createPRWorkload(float alpha); void recvDoneSignal(); + int workCount(); void printAnswerToHostSimout(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6b44f7395b..32b946d29f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -111,6 +111,69 @@ CoalesceEngine::postMemInitSetup() currentDirectory->setLastAtomAddr(lastAtomAddr); } +void +CoalesceEngine::postConsumeProcess() +{ + WorkListItem items[numElementsPerLine]; + for (Addr addr = 0; addr <= lastAtomAddr; addr += peerMemoryAtomSize) { + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + // if (cacheBlocks[block_index].items[index].activeFuture) { + // graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + // cacheBlocks[block_index].items[index].activeNow = true; + // cacheBlocks[block_index].items[index].activeFuture = false; + // } + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + 
futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + delete read_pkt; + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete write_pkt; + } + } +} + void CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 39f2491232..c9d8e47f15 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -205,12 +205,14 @@ class CoalesceEngine : public BaseMemoryEngine virtual void recvFunctional(PacketPtr pkt); void postMemInitSetup(); + void postConsumeProcess(); void swapDirectories(); ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); int workCount(); + int futureWorkCount(); void recvVertexPull(); bool done(); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 099594e9eb..f7ef96197f 100644 --- a/src/accl/graph/sega/enums.cc +++ 
b/src/accl/graph/sega/enums.cc @@ -59,11 +59,4 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = "BULK_SYNCHRONOUS" }; -const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = -{ - "DONT_CARE", - "CONSUMING", - "APPLYING" -}; - } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 969ee8a976..f97c33a0e0 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -69,15 +69,6 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum BulkSynchronousState -{ - DONT_CARE, - CONSUMING, - APPLYING, - NUM_BULK_SYNCHRONOUS_STATE, -}; -extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; - } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 7d75e3e0b7..04393db36d 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -70,6 +70,8 @@ class MPU : public SimObject AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } bool handleIncomingUpdate(PacketPtr pkt); From f59afb8fb699e6ae63af78d6e4dfc165696c319f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 11:17:39 -0800 Subject: [PATCH 234/247] Fixing a bug in async mode. 
--- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/CenteralController.py | 3 ++- src/accl/graph/sega/centeral_controller.cc | 10 +++++----- src/accl/graph/sega/coalesce_engine.cc | 6 +++--- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 07e1b36d9d..b5ce618f7f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -124,7 +124,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("16GiB", cache_size) + gpt = GPT("2GiB", cache_size) gpt.set_vertex_range( [vertex_ranges[i], vertex_ranges[i + num_mpus]] ) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 8727a4c90d..ff97134b47 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -117,7 +117,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("4GiB", cache_size) + gpt = GPT("2GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 9dd8f41e61..f9544ec539 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -37,7 +37,8 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") - image_file = Param.String("Path to the vertex image file.") + vertex_image_file = Param.String("Path to the vertex image file.") + edgelist_image_file = Param.String("Path to the edgelist image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0103b1a0c4..c44789f9f0 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -89,7 +89,7 @@ 
CenteralController::startup() mpu->recvWorkload(workload); } - const auto& file = params().image_file; + const auto& vertex_file = params().vertex_image_file; if (file == "") return; @@ -97,10 +97,10 @@ CenteralController::startup() fatal_if(!object, "%s: Could not load %s.", name(), file); loader::debugSymbolTable.insert(*object->symtab().globals()); - loader::MemoryImage image = object->buildImage(); - maxVertexAddr = image.maxAddr(); + loader::MemoryImage vertex_image = object->buildImage(); + maxVertexAddr = vertex_image.maxAddr(); - PortProxy proxy( + PortProxy vertex_proxy( [this](PacketPtr pkt) { for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; @@ -110,7 +110,7 @@ CenteralController::startup() } }, system->cacheLineSize()); - panic_if(!image.write(proxy), "%s: Unable to write image."); + panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 32b946d29f..35b2bf71cf 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -744,8 +744,6 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } - // FIXME: done() might have a different meaning depending on - // ProcessingMode and Processing state if (done()) { owner->recvDoneSignal(); } @@ -934,7 +932,9 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); - int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + if (atom_active_future) { + int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + } // stats.blockActiveCount.sample(count); // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); From 772795067298f974d713a6a605b0056e30bfe537 Mon 
Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 16:03:25 -0800 Subject: [PATCH 235/247] Debugging and removing typos. sega-ddr represent correct system config. --- configs/accl/sega-ddr/bfs.py | 125 +++++++++++++ configs/accl/sega-ddr/pr.py | 128 +++++++++++++ configs/accl/sega-ddr/sega.py | 200 +++++++++++++++++++++ src/accl/graph/sega/CenteralController.py | 1 - src/accl/graph/sega/centeral_controller.cc | 6 +- src/accl/graph/sega/coalesce_engine.cc | 68 ++++--- src/accl/graph/sega/coalesce_engine.hh | 9 +- 7 files changed, 505 insertions(+), 32 deletions(-) create mode 100644 configs/accl/sega-ddr/bfs.py create mode 100644 configs/accl/sega-ddr/pr.py create mode 100644 configs/accl/sega-ddr/sega.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py new file mode 100644 index 0000000000..8766822b33 --- /dev/null +++ b/configs/accl/sega-ddr/bfs.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + 
system.set_async_mode() + system.create_pop_count_directory(64) + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/pr.py b/configs/accl/sega-ddr/pr.py new file mode 100644 index 0000000000..ea8a103640 --- /dev/null +++ b/configs/accl/sega-ddr/pr.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("iterations", type=int) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.iterations, + args.alpha, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + iterations, + alpha, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_pr_workload(alpha) + if sample: + while True: + 
exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py new file mode 100644 index 0000000000..c5545ee0f1 --- /dev/null +++ b/configs/accl/sega-ddr/sega.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=32, + active_buffer_size=64, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + 
push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=8, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("16GiB") + 
mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + gpt.set_vertex_pch_bit(pch_bit) + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index f9544ec539..bda2fa3d6a 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -38,7 +38,6 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") vertex_image_file = Param.String("Path to the vertex image file.") - edgelist_image_file = Param.String("Path to the edgelist image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c44789f9f0..26e4473b03 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ 
b/src/accl/graph/sega/centeral_controller.cc @@ -90,11 +90,11 @@ CenteralController::startup() } const auto& vertex_file = params().vertex_image_file; - if (file == "") + if (vertex_file == "") return; - auto* object = loader::createObjectFile(file, true); - fatal_if(!object, "%s: Could not load %s.", name(), file); + auto* object = loader::createObjectFile(vertex_file, true); + fatal_if(!object, "%s: Could not load %s.", name(), vertex_file); loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage vertex_image = object->buildImage(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 35b2bf71cf..263e08d901 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -343,12 +343,14 @@ CoalesceEngine::recvWLRead(Addr addr) if (atom_active_now) { currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -457,14 +459,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = futureDirectory->deactivate(addr); futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } assert(MSHR.find(block_index) != MSHR.end()); @@ -522,15 +526,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active_future |= items[index].activeFuture; } if (atom_active_now) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); if (atom_active_future) { - int count_2 = futureDirectory->deactivate(addr); + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } activeBuffer.emplace_back(pkt, curTick()); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); } else { + stats.wastefulBytesRead += pkt->getSize(); delete pkt; } @@ -686,15 +692,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - // TODO: Sample frontier size and blockCount here. 
currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } cacheBlocks[block_index].reset(); } @@ -932,17 +939,21 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); if (atom_active_future) { - int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); onTheFlyReqs++; } } else { if (atom_active_future) { int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } memPort.sendPacket(pkt); onTheFlyReqs++; @@ -956,7 +967,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); - stats.numInvalidWriteBacks++; } } @@ -1141,8 +1151,8 @@ CoalesceEngine::processNextApplyEvent() } } } else { - 
DPRINTF(CoalesceEngine, "%s: Could not find " - "work to apply.\n", __func__); + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; } if (pullCondition()) { @@ -1184,6 +1194,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by coalesce engine"), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), @@ -1192,8 +1204,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. 
(Relative to reset_stats)"), - ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), - "Number of times a scheduled memory function has been invalid."), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "cycles the coalesce engine could not find work for apply"), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(frontierSize, statistics::units::Count::get(), - "Histogram of the length of the bitvector."), - ADD_STAT(blockActiveCount, statistics::units::Count::get(), - "Histogram of the popCount values in the directory"), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory"), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), @@ -1225,8 +1241,10 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - frontierSize.init(64); - blockActiveCount.init(64); + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c9d8e47f15..8ee17781fc 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -171,18 +171,21 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar numConflicts; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; + statistics::Scalar wastefulBytesRead; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidWriteBacks; + statistics::Scalar worklessCycles; statistics::Formula hitRate; statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram frontierSize; - statistics::Histogram blockActiveCount; + statistics::Histogram currentFrontierSize; + statistics::Histogram futureFrontierSize; + statistics::Histogram currentBlockActiveCount; + statistics::Histogram futureBlockActiveCount; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; From 93624ccbddc96f8a561c97a4864f6894d708d528 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 22:51:41 -0800 Subject: [PATCH 236/247] Debugging, finalizing the config and merging new workloads. 
--- configs/accl/sega-ddr/bfs.py | 15 +- configs/accl/sega-ddr/cc.py | 119 +++++++++++ configs/accl/sega-ddr/sega.py | 15 +- configs/accl/sega-ddr/sssp.py | 125 +++++++++++ src/accl/graph/base/graph_workload.cc | 233 +++++++-------------- src/accl/graph/base/graph_workload.hh | 81 ++++--- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/centeral_controller.cc | 18 ++ src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/push_engine.cc | 1 - 10 files changed, 408 insertions(+), 205 deletions(-) create mode 100644 configs/accl/sega-ddr/cc.py create mode 100644 configs/accl/sega-ddr/sssp.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py index 8766822b33..97f1b5dc21 100644 --- a/configs/accl/sega-ddr/bfs.py +++ b/configs/accl/sega-ddr/bfs.py @@ -39,6 +39,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) argparser.add_argument( "--simple", dest="simple", @@ -73,6 +81,7 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.visited, args.simple, args.sample, args.verify, @@ -87,6 +96,7 @@ def get_inputs(): graph, init_addr, init_value, + visited, simple, sample, verify, @@ -103,7 +113,10 @@ def get_inputs(): system.set_async_mode() system.create_pop_count_directory(64) - system.create_bfs_workload(init_addr, init_value) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/sega-ddr/cc.py b/configs/accl/sega-ddr/cc.py new file mode 100644 index 0000000000..9b6d2b587d --- /dev/null +++ b/configs/accl/sega-ddr/cc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 The Regents of the 
University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_cc_workload() + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py index c5545ee0f1..8325cf7565 100644 --- a/configs/accl/sega-ddr/sega.py +++ 
b/configs/accl/sega-ddr/sega.py @@ -56,8 +56,8 @@ def __init__(self, register_file_size: int, cache_size: str): attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -121,7 +121,7 @@ def __init__(self, size: str): dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) ) self.xbar = NoncoherentXBar( - width=8, frontend_latency=1, forward_latency=1, response_latency=1 + width=64, frontend_latency=1, forward_latency=1, response_latency=1 ) self.xbar.mem_side_ports = self.mem_ctrl.port @@ -193,6 +193,15 @@ def create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) diff --git a/configs/accl/sega-ddr/sssp.py b/configs/accl/sega-ddr/sssp.py new file mode 100644 index 0000000000..f2e60b856a --- /dev/null +++ b/configs/accl/sega-ddr/sssp.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_sssp_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + 
system.print_answer() diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 1fa2b287c4..7471e4d073 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -118,90 +118,95 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } -void -BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) -{ - size_t pkt_size = pkt->getSize(); - uint64_t aligned_addr = roundDown(initAddr, pkt_size); - - if (pkt->getAddr() == aligned_addr) { - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); - - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { - dir->activate(aligned_addr); - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); - } -} +// void +// BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) +// { +// size_t pkt_size = pkt->getSize(); +// uint64_t aligned_addr = roundDown(initAddr, pkt_size); + +// if (pkt->getAddr() == aligned_addr) { +// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); +// WorkListItem items[num_elements]; + +// pkt->writeDataToBlock((uint8_t*) items, pkt_size); + +// int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); +// items[index].tempProp = initValue; +// if (activeCondition(items[index])) { +// dir->activate(aligned_addr); +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, pkt_size); +// } +// } + +// uint32_t +// BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) +// { +// return std::min(update, value); +// } + +// uint32_t +// BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) +// { +// return 1; +// } + +// bool +// BFSVisitedWorkload::activeCondition(WorkListItem wl) +// { +// return 
(wl.tempProp < wl.prop) && (wl.degree > 0); +// } + +// uint32_t +// BFSVisitedWorkload::apply(WorkListItem& wl) +// { +// wl.prop = wl.tempProp; +// return wl.prop; +// } + +// std::string +// BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) +// { +// return csprintf( +// "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", +// wl.tempProp, wl.prop, wl.degree, wl.edgeIndex +// ); +// } uint32_t -BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); -} - -uint32_t -BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) -{ - return 1; -} - -bool -BFSVisitedWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -BFSVisitedWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { + return value; } void -SSSPWorkload::init(PacketPtr pkt, WorkDirectory* dir) +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) { + Addr pkt_addr = pkt->getAddr(); size_t pkt_size = pkt->getSize(); - uint64_t aligned_addr = roundDown(initAddr, pkt_size); - - if (pkt->getAddr() == aligned_addr) { - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { - dir->activate(aligned_addr); + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + 
WorkListItem new_wl = items[i]; + new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; + bool vertex_active = activeCondition(new_wl, items[i]); + if (vertex_active) { + new_wl.activeNow = true; } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); - } -} + items[i] = new_wl; + atom_active |= vertex_active; -uint32_t -SSSPWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); } uint32_t @@ -210,29 +215,6 @@ SSSPWorkload::propagate(uint32_t value, uint32_t weight) return value + weight; } -bool -SSSPWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -SSSPWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -SSSPWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); -} - - void BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -309,61 +291,4 @@ BSPPRWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? 
"true" : "false"); } -void -CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) -{ - Addr pkt_addr = pkt->getAddr(); - size_t pkt_size = pkt->getSize(); - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); - bool atom_active = false; - for (int i = 0; i < num_elements; i++) { - items[i].tempProp = (int) ( pkt_addr / sizeof(WorkListItem)) + i; - items[i].prop = -1; - atom_active |= activeCondition(items[i]); - } - if (atom_active) { - dir->activate(pkt->getAddr()); - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); -} - -uint32_t -CCWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); -} - -uint32_t -CCWorkload::propagate(uint32_t value, uint32_t weight) -{ - return value; -} - -bool -CCWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -CCWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -CCWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); -} - } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index fdd4928e10..fa722a634e 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -78,49 +78,31 @@ class BFSWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; -class BFSVisitedWorkload : public GraphWorkload +class BFSVisitedWorkload : public BFSWorkload { - private: - uint64_t initAddr; - uint32_t initValue; - public: - BFSVisitedWorkload(uint64_t init_addr, uint32_t init_value): - initAddr(init_addr), initValue(init_value) + BFSVisitedWorkload(Addr init_addr, uint32_t init_value): + 
BFSWorkload(init_addr, init_value) {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; - ~BFSVisitedWorkload() {} - +class CCWorkload : public BFSVisitedWorkload +{ + public: + CCWorkload(): BFSVisitedWorkload(0, 0) {} virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); }; -class SSSPWorkload : public GraphWorkload +class SSSPWorkload : public BFSWorkload { - private: - uint64_t initAddr; - uint32_t initValue; - public: - SSSPWorkload(uint64_t init_addr, uint32_t init_value): - initAddr(init_addr), initValue(init_value) + SSSPWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) {} - - ~SSSPWorkload() {} - - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; }; - class BSPPRWorkload : public GraphWorkload { private: @@ -140,21 +122,28 @@ class BSPPRWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; -class CCWorkload : public GraphWorkload -{ - - public: - CCWorkload() {} - - ~CCWorkload() {} - - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// class 
BSPBCWorkload : public GraphWorkload +// { +// private: +// int currentDepth; +// Addr initAddr; +// uint32_t initValue; + +// public: +// BSPBCWorkload(Addr init_addr, uint32_t init_value): +// currentDepth(1), initAddr(init_addr), initValue(init_value) +// {} + +// ~BSPBCWorkload() {} + +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual void interIterationInit(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index bda2fa3d6a..f3210a8ec3 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,9 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), PyBindMethod("createPRWorkload"), PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 26e4473b03..8414aee259 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -57,6 +57,24 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } +void +CenteralController::createBFSVisitedWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSVisitedWorkload(init_addr, init_value); +} + +void +CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new 
SSSPWorkload(init_addr, init_value); +} + +void +CenteralController::createCCWorkload() +{ + workload = new CCWorkload(); +} + void CenteralController::createPRWorkload(float alpha) { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ab039e5024..aa3938353d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -70,6 +70,9 @@ class CenteralController : public ClockedObject void createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); + void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value); + void createSSSPWorkload(Addr init_addr, uint32_t init_value); + void createCCWorkload(); void createPRWorkload(float alpha); void recvDoneSignal(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 09f29a43e4..a8c9a1bcb1 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -411,7 +411,6 @@ PushEngine::createUpdatePacket(Addr addr, T value) // bits req->setPC(((Addr) 1) << 2); - // FIXME: MemCmd::UpdateWL PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); From aee9d09f4fbf08f7a2c6f4a81957a82546a8f0bf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 11:15:59 -0800 Subject: [PATCH 237/247] Fixing port proxy bug of limiting size to int. 
--- src/accl/graph/base/graph_workload.cc | 8 ++------ src/accl/graph/sega/centeral_controller.cc | 12 +++++++----- src/accl/graph/sega/mpu.hh | 1 + src/mem/port_proxy.cc | 6 +++--- src/mem/port_proxy.hh | 18 +++++++++--------- src/mem/translating_port_proxy.cc | 6 +++--- src/mem/translating_port_proxy.hh | 6 +++--- 7 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 7471e4d073..38f11778b6 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -193,13 +193,9 @@ CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) for (int i = 0; i < num_elements; i++) { WorkListItem new_wl = items[i]; new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; - bool vertex_active = activeCondition(new_wl, items[i]); - if (vertex_active) { - new_wl.activeNow = true; - } + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; items[i] = new_wl; - atom_active |= vertex_active; - } if (atom_active) { dir->activate(pkt->getAddr()); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 8414aee259..970a0572c5 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -101,6 +101,7 @@ CenteralController::createPopCountDirectory(int atoms_per_block) void CenteralController::startup() { + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); mpu->setProcessingMode(mode); @@ -126,7 +127,7 @@ CenteralController::startup() mpu->recvFunctional(pkt); } } - }, system->cacheLineSize()); + }, vertex_atom); panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); @@ -190,18 +191,19 @@ CenteralController::workCount() void CenteralController::printAnswerToHostSimout() { - int num_items = system->cacheLineSize() / 
sizeof(WorkListItem); + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); + int num_items = vertex_atom / sizeof(WorkListItem); WorkListItem items[num_items]; - for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) + for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom) { - PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + PacketPtr pkt = createReadPacket(addr, vertex_atom); for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; if (contains(range_list, addr)) { mpu->recvFunctional(pkt); } } - pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + pkt->writeDataToBlock((uint8_t*) items, vertex_atom); for (int i = 0; i < num_items; i++) { std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 04393db36d..95d3adeca5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -67,6 +67,7 @@ class MPU : public SimObject void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for 
(ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; @@ -154,7 +154,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. 
*/ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; @@ -165,7 +165,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success. */ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) failed", addr); @@ -189,7 +189,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) 
failed", addr); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadblob that translates virt->phys and deals * with page boundries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. 
*/ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5 From eb22da3749dbb7f17e1464c912cb6314e6cb414b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 11:31:29 -0800 Subject: [PATCH 238/247] Fixing postConsumeProcess. --- src/accl/graph/sega/coalesce_engine.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 263e08d901..4fa400a63a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -115,7 +115,9 @@ void CoalesceEngine::postConsumeProcess() { WorkListItem items[numElementsPerLine]; - for (Addr addr = 0; addr <= lastAtomAddr; addr += peerMemoryAtomSize) { + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); int block_index = getBlockIndex(addr); if (cacheBlocks[block_index].addr == addr) { assert(cacheBlocks[block_index].valid); @@ -125,11 +127,6 @@ CoalesceEngine::postConsumeProcess() bool atom_active_future_after = false; for (int index = 0; index < numElementsPerLine; index++) { assert(!cacheBlocks[block_index].items[index].activeNow); - // if (cacheBlocks[block_index].items[index].activeFuture) { - // graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); - // cacheBlocks[block_index].items[index].activeNow = true; - // cacheBlocks[block_index].items[index].activeFuture = false; - // } atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; From 1acdbb465257bf3f57ab9b4ff2de31fc4bd8fde0 Mon Sep 17 
00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 18:18:28 -0800 Subject: [PATCH 239/247] Addding BC. --- src/accl/graph/base/graph_workload.cc | 157 +++++++++++++-------- src/accl/graph/base/graph_workload.hh | 52 ++++--- src/accl/graph/sega/centeral_controller.cc | 10 ++ 3 files changed, 140 insertions(+), 79 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 38f11778b6..6ac2018629 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -118,63 +118,6 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } -// void -// BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) -// { -// size_t pkt_size = pkt->getSize(); -// uint64_t aligned_addr = roundDown(initAddr, pkt_size); - -// if (pkt->getAddr() == aligned_addr) { -// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); -// WorkListItem items[num_elements]; - -// pkt->writeDataToBlock((uint8_t*) items, pkt_size); - -// int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); -// items[index].tempProp = initValue; -// if (activeCondition(items[index])) { -// dir->activate(aligned_addr); -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, pkt_size); -// } -// } - -// uint32_t -// BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) -// { -// return std::min(update, value); -// } - -// uint32_t -// BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) -// { -// return 1; -// } - -// bool -// BFSVisitedWorkload::activeCondition(WorkListItem wl) -// { -// return (wl.tempProp < wl.prop) && (wl.degree > 0); -// } - -// uint32_t -// BFSVisitedWorkload::apply(WorkListItem& wl) -// { -// wl.prop = wl.tempProp; -// return wl.prop; -// } - -// std::string -// BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) -// { -// return csprintf( -// "WorkListItem{tempProp: %u, prop: %u, degree: 
%u, edgeIndex: %u}", -// wl.tempProp, wl.prop, wl.degree, wl.edgeIndex -// ); -// } - uint32_t BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { return value; @@ -287,4 +230,104 @@ BSPPRWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } +void +BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int pkt_size = pkt->getSize(); + int aligned_addr = roundDown(initAddr, pkt_size); + + if (aligned_addr == pkt->getAddr()) { + int num_elements = pkt_size / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (initAddr - aligned_addr) / sizeof(WorkListItem); + WorkListItem new_wl = items[index]; + uint32_t prop = 0; + prop |= initValue; + // NOTE: Depth of the initial vertex is 0. + prop &= (4294967295U >> 8); + new_wl.tempProp = prop; + new_wl.prop = prop; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BSPBCWorkload::reduce(uint32_t update, uint32_t value) +{ + uint32_t update_depth = (update & depthMask) >> 24; + uint32_t update_count = (update & countMask); + assert(update_depth == (currentDepth - 1)); + uint32_t value_depth = (value & depthMask) >> 24; + uint32_t value_count = (value & countMask); + if (value_depth == 255) { + value_depth = update_depth; + value_count = 0; + } + if (value_depth == currentDepth) { + value_count += update_count; + } + uint32_t ret = 0; + ret |= value_count; + warn_if(value_count > 16777215, "value count has grown bigger than 16777125." + " This means the algorithm result might not be correct." + " However, the traversal will not be affected." + " Therefore, performane metrics could be used."); + // HACK: Make sure to always set the depth correctly even if count + // exceeds the 2^24-1 limit. 
Here we reset the depth section of ret. + ret &= (4294967295U >> 8); + // NOTE: Now that the depth is securely reset we can copy the correct value. + ret |= (value_depth << 24); + return ret; +} + +uint32_t +BSPBCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +uint32_t +BSPBCWorkload::apply(WorkListItem& wl) +{ + return wl.prop; +} + +void +BSPBCWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; +} + +bool +BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + uint32_t depth = (new_wl.tempProp & depthMask) >> 24; + return (depth == currentDepth); +} + +std::string +BSPBCWorkload::printWorkListItem(WorkListItem wl) +{ + uint32_t temp_depth = (wl.tempProp & depthMask) >> 24; + uint32_t temp_count = (wl.tempProp & countMask); + uint32_t depth = (wl.prop & depthMask) >> 24; + uint32_t count = (wl.prop & countMask); + return csprintf( + "WorkListItem{tempProp: (depth: %d, count: %d), " + "prop: (depth: %d, count: %d), degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index fa722a634e..4ed3dcf3ac 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual void iterate() = 0; virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; @@ -73,6 +74,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); @@ -117,33 +119,39 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; -// class BSPBCWorkload : public GraphWorkload -// { -// private: -// int currentDepth; -// Addr initAddr; -// uint32_t initValue; - -// public: -// BSPBCWorkload(Addr init_addr, uint32_t init_value): -// currentDepth(1), initAddr(init_addr), initValue(init_value) -// {} - -// ~BSPBCWorkload() {} - -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// 
virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual void interIterationInit(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): + currentDepth(0), initAddr(init_addr), initValue(init_value), + depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; } diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 970a0572c5..15062f1465 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -131,6 +131,11 @@ CenteralController::startup() panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); + // IDEA: Should this be here or after calling start? + // Point of iterate here is to set global variables. + // At this point, we know that vertex memory has been + // initialized and we can initialize global variables. 
+ workload->iterate(); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -170,6 +175,11 @@ CenteralController::recvDoneSignal() for (auto mpu: mpuVector) { mpu->postConsumeProcess(); mpu->swapDirectories(); + // IDEA: Should this be here or after calling start? + // Point of iterate here is to update global variables. + // At this point, we know that vertex memory has been + // updated and we can update global variables. + workload->iterate(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } From c6af36c8432cd6057cc4b3bbc0a88c007ef557f5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 20:58:16 -0800 Subject: [PATCH 240/247] Adding BC and degbugging. --- configs/accl/{sega-ddr/pr.py => bc.py} | 18 +- configs/accl/bfs.py | 20 +- configs/accl/{sega-ddr => }/cc.py | 0 configs/accl/sega-ddr/bfs.py | 138 -------------- configs/accl/sega-ddr/sega.py | 209 --------------------- configs/accl/sega.py | 98 +++++++--- configs/accl/sega_simple.py | 96 +++++++--- configs/accl/{sega-ddr => }/sssp.py | 0 src/accl/graph/base/graph_workload.cc | 9 +- src/accl/graph/base/graph_workload.hh | 4 +- src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 18 +- src/accl/graph/sega/centeral_controller.hh | 1 + 13 files changed, 195 insertions(+), 417 deletions(-) rename configs/accl/{sega-ddr/pr.py => bc.py} (90%) rename configs/accl/{sega-ddr => }/cc.py (100%) delete mode 100644 configs/accl/sega-ddr/bfs.py delete mode 100644 configs/accl/sega-ddr/sega.py rename configs/accl/{sega-ddr => }/sssp.py (100%) diff --git a/configs/accl/sega-ddr/pr.py b/configs/accl/bc.py similarity index 90% rename from configs/accl/sega-ddr/pr.py rename to configs/accl/bc.py index ea8a103640..074bee73b9 100644 --- a/configs/accl/sega-ddr/pr.py +++ b/configs/accl/bc.py @@ -34,10 +34,12 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", 
type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) - argparser.add_argument("alpha", type=float) + argparser.add_argument("iterations", type=int) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) argparser.add_argument( "--simple", dest="simple", @@ -67,10 +69,12 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.iterations, - args.alpha, + args.init_addr, + args.init_value, args.simple, args.sample, args.verify, @@ -80,10 +84,12 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, iterations, - alpha, + init_addr, + init_value, simple, sample, verify, @@ -93,14 +99,14 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha) + system.create_bc_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index ab5de485b1..97f1b5dc21 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -34,10 +34,19 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) argparser.add_argument( 
"--simple", dest="simple", @@ -67,10 +76,12 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.init_addr, args.init_value, + args.visited, args.simple, args.sample, args.verify, @@ -80,10 +91,12 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, init_addr, init_value, + visited, simple, sample, verify, @@ -93,14 +106,17 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() system.set_async_mode() system.create_pop_count_directory(64) - system.create_bfs_workload(init_addr, init_value) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/sega-ddr/cc.py b/configs/accl/cc.py similarity index 100% rename from configs/accl/sega-ddr/cc.py rename to configs/accl/cc.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py deleted file mode 100644 index 97f1b5dc21..0000000000 --- a/configs/accl/sega-ddr/bfs.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -import m5 -import argparse - -from m5.objects import * - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - argparser.add_argument( - "--visited", - dest="visited", - action="store_const", - const=True, - default=False, - help="Use visitation version of BFS", - ) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) - argparser.add_argument( - "--sample", - dest="sample", - action="store_const", - const=True, - default=False, - help="Sample sim stats every 100us", - ) - argparser.add_argument( - "--verify", - dest="verify", - action="store_const", - const=True, - default=False, - help="Print final answer", - ) - - args = argparser.parse_args() - - return ( - args.num_gpts, - args.num_registers, - args.cache_size, - args.graph, - args.init_addr, - args.init_value, - args.visited, - args.simple, - args.sample, - args.verify, - ) - - -if __name__ == "__m5_main__": - ( - num_gpts, - num_registers, - cache_size, - graph, - init_addr, - init_value, - visited, - simple, - sample, - verify, - ) = get_inputs() - - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) - root = Root(full_system=False, system=system) - - m5.instantiate() - - system.set_async_mode() - system.create_pop_count_directory(64) - if visited: - system.create_bfs_visited_workload(init_addr, init_value) - else: - system.create_bfs_workload(init_addr, init_value) - if sample: - while True: - exit_event = m5.simulate(100000000) - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - 
m5.stats.dump() - m5.stats.reset() - if exit_event.getCause() != "simulate() limit reached": - break - else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - if verify: - system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py deleted file mode 100644 index 8325cf7565..0000000000 --- a/configs/accl/sega-ddr/sega.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from math import log -from m5.objects import * - - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append( - AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i, - ) - ) - return ret, intlv_low_bit + intlv_bits - 1 - - -class GPT(SubSystem): - def __init__(self, register_file_size: int, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - max_resp_per_cycle=8, - pending_pull_limit=64, - active_buffer_size=80, - post_push_wb_queue_size=64, - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=4096, - max_propagates_per_cycle=8, - update_queue_size=32, - ) - - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - dram_2=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - ) - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - 
push_engine=self.push_engine, - ) - - def getRespPort(self): - return self.wl_engine.in_ports - - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - - def setReqPort(self, port): - self.push_engine.out_ports = port - - def getEdgeMemPort(self): - return self.push_engine.mem_port - - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port - - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit - - -class EdgeMemory(SubSystem): - def __init__(self, size: str): - super(EdgeMemory, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = "2.4GHz" - self.clk_domain.voltage_domain = VoltageDomain() - - self.mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) - ) - self.xbar = NoncoherentXBar( - width=64, frontend_latency=1, forward_latency=1, response_latency=1 - ) - self.xbar.mem_side_ports = self.mem_ctrl.port - - def set_image(self, image): - self.mem_ctrl.dram.image_file = image - - def getPort(self): - return self.xbar.cpu_side_ports - - def setPort(self, port): - self.xbar.cpu_side_ports = port - -class SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): - super(SEGA, self).__init__() - # num_gpts should be an even power of 2 - assert num_gpts != 0 - assert num_gpts % 2 == 0 - assert (num_gpts & (num_gpts - 1)) == 0 - - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = "2GHz" - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - # Building the CenteralController - self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") - # Building the EdgeMemories - edge_mem = [] - for i in range(int(num_gpts/2)): - mem = EdgeMemory("16GiB") - 
mem.set_image(f"{graph_path}/edgelist_{i}") - edge_mem.append(mem) - self.edge_mem = edge_mem - # Building the GPTs - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 - ) - gpts = [] - for i in range(num_gpts): - gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_gpts]] - ) - gpt.set_vertex_pch_bit(pch_bit) - gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def work_count(self): - return self.ctrl.workCount() - - def set_async_mode(self): - self.ctrl.setAsyncMode() - - def set_bsp_mode(self): - self.ctrl.setBSPMode() - - def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) - - def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) - - def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) - - def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) - - def create_cc_workload(self): - self.ctrl.createCCWorkload() - - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) - - def print_answer(self): - self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b5ce618f7f..32124731d6 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,16 +47,17 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__( - self, edge_memory_size: str, cache_size: str): + def __init__(self, register_file_size: int, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, 
register_file_size=64) + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -68,16 +69,14 @@ def __init__( ) self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), - dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) - ) - - self.edge_mem_ctrl = MemCtrl( - dram= - DDR4_2400_8x8(range=AddrRange(edge_memory_size), in_addr_map=False) + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( wl_engine=self.wl_engine, @@ -97,6 +96,12 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.out_ports = port + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + def set_vertex_range(self, vertex_ranges): self.vertex_mem_ctrl.dram.range = vertex_ranges[0] self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] @@ -104,32 +109,65 @@ def set_vertex_range(self, vertex_ranges): def set_vertex_pch_bit(self, pch_bit): self.vertex_mem_ctrl.pch_bit = pch_bit - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + 
dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("16GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_mpus, 32 + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 ) - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_mpus]] + [vertex_ranges[i], vertex_ranges[i + num_gpts]] ) gpt.set_vertex_pch_bit(pch_bit) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: @@ -154,8 +192,20 @@ def 
create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ff97134b47..ff567b57e3 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -47,16 +47,17 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__( - self, edge_memory_size: str, cache_size: str): + def __init__(self, register_file_size: int, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -67,14 +68,10 @@ def __init__( update_queue_size=32, ) - self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") - - self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False) + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port 
- self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( wl_engine=self.wl_engine, @@ -94,32 +91,77 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.out_ports = port + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - + # Building the CenteralController + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("16GiB") + 
mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), num_mpus, 32 + AddrRange(start=0, size="4GiB"), num_gpts, 32 ) - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) gpt.set_vertex_range(vertex_ranges[i]) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: @@ -144,8 +186,20 @@ def create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega-ddr/sssp.py b/configs/accl/sssp.py similarity index 100% rename from configs/accl/sega-ddr/sssp.py rename to configs/accl/sssp.py diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 6ac2018629..7bcd447b8e 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -245,7 +245,7 @@ BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) uint32_t prop = 0; prop |= initValue; // NOTE: Depth of the initial vertex is 0. 
- prop &= (4294967295U >> 8); + prop &= countMask; new_wl.tempProp = prop; new_wl.prop = prop; if (activeCondition(new_wl, items[index])) { @@ -265,11 +265,10 @@ BSPBCWorkload::reduce(uint32_t update, uint32_t value) { uint32_t update_depth = (update & depthMask) >> 24; uint32_t update_count = (update & countMask); - assert(update_depth == (currentDepth - 1)); uint32_t value_depth = (value & depthMask) >> 24; uint32_t value_count = (value & countMask); if (value_depth == 255) { - value_depth = update_depth; + value_depth = currentDepth; value_count = 0; } if (value_depth == currentDepth) { @@ -283,7 +282,7 @@ BSPBCWorkload::reduce(uint32_t update, uint32_t value) " Therefore, performane metrics could be used."); // HACK: Make sure to always set the depth correctly even if count // exceeds the 2^24-1 limit. Here we reset the depth section of ret. - ret &= (4294967295U >> 8); + ret &= countMask; // NOTE: Now that the depth is securely reset we can copy the correct value. ret |= (value_depth << 24); return ret; @@ -311,7 +310,7 @@ bool BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { uint32_t depth = (new_wl.tempProp & depthMask) >> 24; - return (depth == currentDepth); + return (depth == currentDepth) && (new_wl.degree > 0); } std::string diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 4ed3dcf3ac..5a55ad4cdc 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -137,8 +137,8 @@ class BSPBCWorkload : public GraphWorkload uint32_t countMask; public: BSPBCWorkload(Addr init_addr, uint32_t init_value): - currentDepth(0), initAddr(init_addr), initValue(init_value), - depthMask(4278190080), countMask(16777215) + initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) {} ~BSPBCWorkload() {} diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 
f3210a8ec3..7e16b7e7de 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -50,6 +50,7 @@ class CenteralController(ClockedObject): PyBindMethod("createSSSPWorkload"), PyBindMethod("createCCWorkload"), PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 15062f1465..86b9ea2b02 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -81,6 +81,12 @@ CenteralController::createPRWorkload(float alpha) workload = new BSPPRWorkload(alpha); } +void +CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BSPBCWorkload(init_addr, init_value); +} + void CenteralController::createPopCountDirectory(int atoms_per_block) { @@ -131,17 +137,13 @@ CenteralController::startup() panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); - // IDEA: Should this be here or after calling start? - // Point of iterate here is to set global variables. - // At this point, we know that vertex memory has been - // initialized and we can initialize global variables. - workload->iterate(); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } + workload->iterate(); } PacketPtr @@ -175,15 +177,11 @@ CenteralController::recvDoneSignal() for (auto mpu: mpuVector) { mpu->postConsumeProcess(); mpu->swapDirectories(); - // IDEA: Should this be here or after calling start? - // Point of iterate here is to update global variables. - // At this point, we know that vertex memory has been - // updated and we can update global variables. 
- workload->iterate(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } + workload->iterate(); exitSimLoopNow("finished an iteration."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index aa3938353d..ba829061b5 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,6 +74,7 @@ class CenteralController : public ClockedObject void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); void createPRWorkload(float alpha); + void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); From 787f7f4f45ffeb9e312f4a9000f58742552b555d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 21:03:16 -0800 Subject: [PATCH 241/247] Fixing BC run script. --- configs/accl/bc.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/configs/accl/bc.py b/configs/accl/bc.py index 074bee73b9..56faeb3e4d 100644 --- a/configs/accl/bc.py +++ b/configs/accl/bc.py @@ -37,7 +37,6 @@ def get_inputs(): argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) argparser.add_argument( @@ -72,7 +71,6 @@ def get_inputs(): args.num_registers, args.cache_size, args.graph, - args.iterations, args.init_addr, args.init_value, args.simple, @@ -87,7 +85,6 @@ def get_inputs(): num_registers, cache_size, graph, - iterations, init_addr, init_value, simple, @@ -119,16 +116,16 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - iteration = 0 - while iteration < iterations: + iterations = 0 + while True: exit_event = m5.simulate() print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" ) - iteration += 1 
+ iterations += 1 if system.work_count() == 0: break - print(f"#iterations: {iteration}") + print(f"#iterations: {iterations}") if verify: system.print_answer() From b13d005fcb65f7d9e6d97ecc6285044055efa7d7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 16 Nov 2022 22:54:39 -0800 Subject: [PATCH 242/247] Fixing dirty issue in bsp. --- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 32124731d6..672151ceed 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -152,7 +152,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts/2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ff567b57e3..06908d08d3 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -147,7 +147,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4fa400a63a..a2d4378377 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -114,7 +114,6 @@ CoalesceEngine::postMemInitSetup() void CoalesceEngine::postConsumeProcess() { - WorkListItem items[numElementsPerLine]; Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { Addr addr = 
peerMemoryRange.addIntlvBits(local_addr); @@ -133,6 +132,7 @@ CoalesceEngine::postConsumeProcess() if (cacheBlocks[block_index].items[index].activeFuture) { cacheBlocks[block_index].items[index].activeFuture = false; cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; } } if (!atom_active_future_before && atom_active_future_after) { @@ -142,10 +142,10 @@ CoalesceEngine::postConsumeProcess() futureActiveCacheBlocks.erase(block_index); } } else { + WorkListItem items[numElementsPerLine]; PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); memPort.sendFunctional(read_pkt); read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - delete read_pkt; bool atom_active_future_before = false; bool atom_active_future_after = false; for (int index = 0; index < numElementsPerLine; index++) { @@ -166,6 +166,7 @@ CoalesceEngine::postConsumeProcess() } PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); memPort.sendFunctional(write_pkt); + delete read_pkt; delete write_pkt; } } From 7861b6a29700aaaf606a6f4b5a47611aea086c87 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Nov 2022 19:26:29 -0800 Subject: [PATCH 243/247] Adding Async PR. 
--- configs/accl/async-pr.py | 125 +++++++++++++++++++++ configs/accl/pr.py | 6 +- configs/accl/sega.py | 6 + configs/accl/sega_simple.py | 3 + src/accl/graph/base/graph_workload.cc | 78 +++++++++++++ src/accl/graph/base/graph_workload.hh | 30 ++++- src/accl/graph/sega/CenteralController.py | 2 + src/accl/graph/sega/centeral_controller.cc | 13 +++ src/accl/graph/sega/centeral_controller.hh | 2 + src/accl/graph/sega/coalesce_engine.cc | 23 +++- src/accl/graph/sega/coalesce_engine.hh | 3 + src/accl/graph/sega/wl_engine.cc | 9 ++ src/accl/graph/sega/wl_engine.hh | 3 + 13 files changed, 294 insertions(+), 9 deletions(-) create mode 100644 configs/accl/async-pr.py diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py new file mode 100644 index 0000000000..0bfb6caeaa --- /dev/null +++ b/configs/accl/async-pr.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
# (BSD-3 license header precedes this excerpt in the original file.)

import argparse

import m5
from m5.objects import *


def get_inputs():
    """Parse command-line arguments for an asynchronous PageRank run.

    Positional: num_gpts, num_registers, cache_size, graph, alpha,
    threshold. Flags: --simple (SimpleMemory vertex memory), --sample
    (periodic stat dumps), --verify (print the final answer).

    Returns the parsed values as a flat tuple in declaration order.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("num_gpts", type=int)
    argparser.add_argument("num_registers", type=int)
    argparser.add_argument("cache_size", type=str)
    argparser.add_argument("graph", type=str)
    argparser.add_argument("alpha", type=float)
    argparser.add_argument("threshold", type=float)
    # NOTE: action="store_true" is the idiomatic, behavior-identical
    # spelling of action="store_const", const=True, default=False.
    argparser.add_argument(
        "--simple",
        dest="simple",
        action="store_true",
        help="Use simple memory for vertex",
    )
    argparser.add_argument(
        "--sample",
        dest="sample",
        action="store_true",
        help="Sample sim stats every 100us",
    )
    argparser.add_argument(
        "--verify",
        dest="verify",
        action="store_true",
        help="Print final answer",
    )

    args = argparser.parse_args()

    return (
        args.num_gpts,
        args.num_registers,
        args.cache_size,
        args.graph,
        args.alpha,
        args.threshold,
        args.simple,
        args.sample,
        args.verify,
    )


if __name__ == "__m5_main__":
    (
        num_gpts,
        num_registers,
        cache_size,
        graph,
        alpha,
        threshold,
        simple,
        sample,
        verify,
    ) = get_inputs()

    # sega_simple builds the system with SimpleMemory for the vertex
    # memory; sega uses the detailed memory model.
    if simple:
        from sega_simple import SEGA
    else:
        from sega import SEGA
    system = SEGA(num_gpts, num_registers, cache_size, graph)
    root = Root(full_system=False, system=system)

    m5.instantiate()

    system.set_async_mode()
    system.create_pop_count_directory(64)
    system.create_async_pr_workload(alpha, threshold)
    if sample:
        # Dump and reset stats every 100us of simulated time until the
        # workload signals completion (any cause other than the limit).
        while True:
            exit_event = m5.simulate(100000000)
            print(
                f"Exited simulation at tick {m5.curTick()} "
                f"because {exit_event.getCause()}"
            )
            m5.stats.dump()
            m5.stats.reset()
            if exit_event.getCause() != "simulate() limit reached":
                break
    else:
        exit_event = m5.simulate()
        print(
            f"Exited simulation at tick {m5.curTick()} "
            f"because {exit_event.getCause()}"
        )
    if verify:
        system.print_answer()
create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def get_pr_error(self): + return self.ctrl.getPRError() + def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 06908d08d3..d6ae8772a5 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -195,6 +195,9 @@ def create_sssp_workload(self, init_addr, init_value): def create_cc_workload(self): self.ctrl.createCCWorkload() + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 7bcd447b8e..3a401f0963 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -154,6 +154,81 @@ SSSPWorkload::propagate(uint32_t value, uint32_t weight) return value + weight; } +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int num_elements = pkt->getSize() / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt->getSize()); + + bool atom_active = false; + for (int index = 0; index < num_elements; index++) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = readFromFloat(0); + new_wl.prop = readFromFloat(1 - alpha); + atom_active |= activeCondition(new_wl, items[index]); + items[index] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt->getSize()); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return 
readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { + weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + float temp_float = writeToFloat(new_wl.tempProp); + float prop_float = writeToFloat(new_wl.prop); + float dist = std::abs(temp_float - prop_float); + return (dist >= threshold) && (new_wl.degree > 0); +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + wl.prop = wl.tempProp; + return readFromFloat(delta); +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + void BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -212,6 +287,9 @@ BSPPRWorkload::apply(WorkListItem& wl) void BSPPRWorkload::interIterationInit(WorkListItem& wl) { + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + error += std::abs(temp_float - prop_float); wl.prop = wl.tempProp; wl.tempProp = readFromFloat(1 - alpha); wl.activeFuture = (wl.degree > 0); diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 5a55ad4cdc..d42bfd0f26 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -105,13 +105,37 @@ class SSSPWorkload : public BFSWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight) override; }; +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {}; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + class BSPPRWorkload : public GraphWorkload { private: float alpha; + float error; public: - BSPPRWorkload(float alpha): alpha(alpha) {} + BSPPRWorkload(float alpha): alpha(alpha), error(0) {} ~BSPPRWorkload() {} @@ -119,10 +143,12 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual void iterate() {} + virtual void iterate() { error = 0; } virtual void 
interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return error; } }; class BSPBCWorkload : public GraphWorkload diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 7e16b7e7de..c5f44c82e9 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -49,8 +49,10 @@ class CenteralController(ClockedObject): PyBindMethod("createBFSVisitedWorkload"), PyBindMethod("createSSSPWorkload"), PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), PyBindMethod("createPRWorkload"), PyBindMethod("createBCWorkload"), PyBindMethod("workCount"), + PyBindMethod("getPRError"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 86b9ea2b02..23eb6bbc0e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -75,6 +75,12 @@ CenteralController::createCCWorkload() workload = new CCWorkload(); } +void +CenteralController::createAsyncPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} + void CenteralController::createPRWorkload(float alpha) { @@ -196,6 +202,13 @@ CenteralController::workCount() return work_count; } +float +CenteralController::getPRError() +{ + BSPPRWorkload* pr_workload = dynamic_cast(workload); + return pr_workload->getError(); +} + void CenteralController::printAnswerToHostSimout() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ba829061b5..e73ed22666 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -73,12 +73,14 @@ class CenteralController : public ClockedObject void createBFSVisitedWorkload(Addr init_addr, 
uint32_t init_value); void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); + void createAsyncPRWorkload(float alpha, float threshold); void createPRWorkload(float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); int workCount(); + float getPRError(); void printAnswerToHostSimout(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a2d4378377..02c98ba640 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -59,6 +59,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): nextApplyEvent([this] { processNextApplyEvent(); }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -552,8 +555,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } } - if (done()) { - owner->recvDoneSignal(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } return true; } @@ -712,8 +715,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) block_index, cacheBlocks[block_index].to_string()); stats.numVertexWrites++; - if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { - owner->recvDoneSignal(); + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } } @@ -749,8 +753,8 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } - if (done()) { - owner->recvDoneSignal(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } } @@ -1170,6 +1174,13 @@ CoalesceEngine::processNextApplyEvent() } } +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} 
CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8ee17781fc..b6eec725f9 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -151,6 +151,9 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ed91622b43..d563450179 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -43,6 +43,7 @@ WLEngine::WLEngine(const WLEngineParams& params): registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), stats(*this) { for (int i = 0; i < params.port_in_ports_connection_count; ++i) { @@ -316,6 +317,14 @@ WLEngine::processNextReduceEvent() } workListFile.clear(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +WLEngine::processNextDoneSignalEvent() +{ if (done()) { owner->recvDoneSignal(); } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 45baaa1e79..fb147e692a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -90,6 +90,9 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + struct WorkListStats : public statistics::Group { WorkListStats(WLEngine &worklist); From 
a991328c22c7dfa6b1b1e03d6d18868c651c3c0e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Nov 2022 20:33:07 -0800 Subject: [PATCH 244/247] Fixing typos. --- configs/accl/pr.py | 14 ++++++++++++-- configs/accl/sega.py | 4 ++-- configs/accl/sega_simple.py | 4 ++-- src/accl/graph/base/graph_workload.cc | 6 +++--- src/accl/graph/base/graph_workload.hh | 10 +++++++--- src/accl/graph/sega/centeral_controller.cc | 4 ++-- src/accl/graph/sega/centeral_controller.hh | 2 +- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 42ae46ea78..569514eb82 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -36,9 +36,11 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) + argparser.add_argument("iterations", type=int) argparser.add_argument("alpha", type=float) + argparser.add_argument("--num_nodes", type=int, default=1) + argparser.add_argument("--error_threshold", type=float, default=0.0) argparser.add_argument( "--simple", dest="simple", @@ -73,6 +75,8 @@ def get_inputs(): args.graph, args.iterations, args.alpha, + args.num_nodes, + args.error_threshold, args.simple, args.sample, args.verify, @@ -87,11 +91,15 @@ def get_inputs(): graph, iterations, alpha, + num_nodes, + error_threshold, simple, sample, verify, ) = get_inputs() + print(f"error_threshold: {error_threshold}") + if simple: from sega_simple import SEGA else: @@ -103,7 +111,7 @@ def get_inputs(): system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha) + system.create_pr_workload(num_nodes, alpha) if sample: while True: exit_event = m5.simulate(100000000) @@ -125,6 +133,8 @@ def get_inputs(): ) iteration += 1 print(f"error: {system.get_pr_error()}") + if system.get_pr_error() < error_threshold: + break if 
system.work_count() == 0: break print(f"#iterations: {iteration}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ef23575b9b..32d0dd26ab 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -204,8 +204,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def get_pr_error(self): return self.ctrl.getPRError() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index d6ae8772a5..2d36ec584d 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -198,8 +198,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 3a401f0963..ab58b02b73 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -240,8 +240,8 @@ BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) bool atom_active = false; for (int i = 0; i < num_elements; i++) { WorkListItem new_wl = items[i]; - new_wl.tempProp = readFromFloat(1 - alpha); - new_wl.prop = readFromFloat(1); + new_wl.tempProp = readFromFloat((1 - alpha)/numNodes); + new_wl.prop = readFromFloat(1/numNodes); new_wl.activeNow = activeCondition(new_wl, items[i]); atom_active |= new_wl.activeNow; items[i] = new_wl; @@ -291,7 +291,7 @@ BSPPRWorkload::interIterationInit(WorkListItem& wl) float prop_float = writeToFloat(wl.prop); error += 
std::abs(temp_float - prop_float); wl.prop = wl.tempProp; - wl.tempProp = readFromFloat(1 - alpha); + wl.tempProp = readFromFloat((1 - alpha) / numNodes); wl.activeFuture = (wl.degree > 0); } diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index d42bfd0f26..72748502c1 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -131,11 +131,15 @@ class PRWorkload : public GraphWorkload class BSPPRWorkload : public GraphWorkload { private: + int numNodes; float alpha; + float prevError; float error; public: - BSPPRWorkload(float alpha): alpha(alpha), error(0) {} + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} ~BSPPRWorkload() {} @@ -143,12 +147,12 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual void iterate() { error = 0; } + virtual void iterate() { prevError = error; error = 0; } virtual void interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); - float getError() { return error; } + float getError() { return prevError; } }; class BSPBCWorkload : public GraphWorkload diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 23eb6bbc0e..0aee3b77ce 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,9 +82,9 @@ CenteralController::createAsyncPRWorkload(float alpha, float threshold) } void -CenteralController::createPRWorkload(float alpha) +CenteralController::createPRWorkload(int num_nodes, float alpha) { - workload = new BSPPRWorkload(alpha); + workload = new BSPPRWorkload(num_nodes, alpha); } void diff --git 
a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index e73ed22666..cce9ac2725 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,7 +74,7 @@ class CenteralController : public ClockedObject void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); void createAsyncPRWorkload(float alpha, float threshold); - void createPRWorkload(float alpha); + void createPRWorkload(int num_nodes, float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); From da4decf6a2960a7489f1d8450069a9314dae21b0 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 7 Feb 2023 14:03:15 -0800 Subject: [PATCH 245/247] Fixing init in asyncPR. --- src/accl/graph/base/graph_workload.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index ab58b02b73..fd802cf275 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -166,7 +166,8 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem new_wl = items[index]; new_wl.tempProp = readFromFloat(0); new_wl.prop = readFromFloat(1 - alpha); - atom_active |= activeCondition(new_wl, items[index]); + new_wl.activeNow = activeCondition(new_wl, items[index]); + atom_active |= new_wl.activeNow; items[index] = new_wl; } if (atom_active) { From 7256874c4596608c6721768b3f06a1bd21f16879 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 9 Mar 2023 11:27:37 -0800 Subject: [PATCH 246/247] Improving UniqueFIFO implementation. 
/**
 * A FIFO that holds at most one live copy of each item, with O(1)
 * membership tests.
 *
 * Items must be integers in [0, size) because they index the per-item
 * presence counters. erase() is lazy: the item is only marked deleted
 * and physically dropped from the head later by fix_front().
 *
 * Invariant: added[i] - deleted[i] is 1 when i is live, 0 otherwise.
 *
 * BUGFIX: the counters are now std::deque<int> instead of raw new[]'d
 * arrays. The previous version never freed them (no destructor) and its
 * operator= copied the raw pointers, so two FIFOs aliased the same
 * counter storage and clear()/pop on one corrupted the other. With
 * value-semantic containers, the implicit copy/move/destructor are all
 * correct (rule of zero).
 */
template<typename T>
class UniqueFIFO
{
  private:
    int cap;  // capacity: number of distinct representable items
    int pop;  // population: live (pushed, not yet popped/erased) items

    std::deque<int> added;    // per-item push counter (0 or 1 live)
    std::deque<int> deleted;  // per-item pending lazy-delete counter
    std::deque<T> container;  // FIFO order; may hold lazily-erased items

  public:
    UniqueFIFO(): cap(0), pop(0) {}

    UniqueFIFO(int size):
        cap(size), pop(0), added(size, 0), deleted(size, 0)
    {}

    // Drop lazily-erased items off the head until a live item surfaces.
    // Precondition: at least one live item exists (size() > 0).
    void fix_front() {
        assert(!container.empty());
        while (true) {
            T elem = container.front();
            if (deleted[elem] > 0) {
                deleted[elem]--;
                added[elem]--;
                container.pop_front();
            } else {
                assert(deleted[elem] == 0);
                assert(added[elem] == 1);
                break;
            }
        }
    }

    // Oldest live item.
    T front() {
        fix_front();
        return container.front();
    }

    size_t size() {
        return pop;
    }

    void clear() {
        pop = 0;
        added.assign(cap, 0);
        deleted.assign(cap, 0);
        container.clear();
    }

    bool empty() {
        return size() == 0;
    }

    // O(1) membership test via the counter invariant.
    bool find(T item) {
        assert(added[item] >= 0);
        assert(deleted[item] >= 0);
        int diff = added[item] - deleted[item];
        assert((diff == 0) || (diff == 1));
        return (diff == 1);
    }

    // Push only if the item is not already live (uniqueness).
    void push_back(T item) {
        if (!find(item)) {
            added[item]++;
            pop++;
            container.push_back(item);
        }
    }

    // Remove the oldest live item; fix_front() guarantees the head is
    // live before we pop it.
    void pop_front() {
        T elem = front();
        assert(added[elem] == 1);
        added[elem] = 0;
        pop--;
        container.pop_front();
    }

    // Lazily delete a live item anywhere in the FIFO; the physical slot
    // is reclaimed when it reaches the head.
    void erase(T item) {
        assert(find(item));
        deleted[item]++;
        pop--;
    }

    // Rule of zero: the implicitly-defined copy/move/assign/destructor
    // deep-copy or release the deques correctly.
};
pullsScheduled--; if (!currentDirectory->empty()) { Addr addr = currentDirectory->getNextWork(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a8c9a1bcb1..981b581b7c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -273,7 +273,9 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + // uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + // TODO: Change above line to below line. + uint8_t pkt_data [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); @@ -291,7 +293,7 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); - delete pkt_data; + // delete [] pkt_data; delete pkt; if (!nextPropagateEvent.scheduled()) { diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 18430aee0d..620e97f654 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -100,6 +100,7 @@ class PopCountDirectory: public WorkDirectory for (int index = 0; index < numCounters; index++) { popCount[index] = 0; } + activeBlockIndices = UniqueFIFO(numCounters); } // CAUTION: This should only be called when the work From 8673a9d8449bba7cf2dc4734a651fbd10852acd8 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 9 Mar 2023 14:06:05 -0800 Subject: [PATCH 247/247] Adding asynchronous temporal partitioning --- src/accl/graph/sega/MPU.py | 5 +- src/accl/graph/sega/centeral_controller.cc | 31 + src/accl/graph/sega/centeral_controller.hh | 5 + src/accl/graph/sega/coalesce_engine.hh | 4 + src/accl/graph/sega/coalesce_engine_s.cc | 1223 ++++++++++++++++++++ src/accl/graph/sega/mpu.cc | 52 +- src/accl/graph/sega/mpu.hh | 16 +- 
src/accl/graph/sega/wl_engine.cc | 14 + 8 files changed, 1343 insertions(+), 7 deletions(-) create mode 100644 src/accl/graph/sega/coalesce_engine_s.cc diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 8d2453b01c..79fa7db8d0 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,9 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.SimObject import SimObject +# from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject -class MPU(SimObject): +class MPU(ClockedObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0aee3b77ce..fc4bacd414 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -93,6 +93,18 @@ CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value) workload = new BSPBCWorkload(init_addr, init_value); } +bool +CenteralController::bufferRemoteUpdate(int slice_number, PacketPtr pkt) +{ + for (auto mpu: mpuVector) { + if (contains(mpu->getAddrRanges(), pkt->getAddr())) { + remoteUpdates[mpu][slice_number].push_back(pkt); + } + } + + return true; +} + void CenteralController::createPopCountDirectory(int atoms_per_block) { @@ -173,6 +185,25 @@ CenteralController::recvDoneSignal() bool done = true; for (auto mpu : mpuVector) { done &= mpu->done(); + int total_num_slices = remoteUpdates[mpu].size(); + if (mpu->done()) { + int slice_number = mpu->getSliceCounter() + 1; + while ((total_num_slices != 0) && (slice_number != mpu->getSliceCounter())) { + if (!remoteUpdates[mpu][slice_number].empty()) { + mpu->scheduleNewSlice(); + mpu->updateSliceCounter(slice_number); + done = false; + break; + } + else { + if (slice_number == total_num_slices) { + slice_number = 0; + } else { + slice_number++; + } + } + } + } } if (done && mode == ProcessingMode::ASYNCHRONOUS) { diff 
--git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index cce9ac2725..6692d999ed 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -77,11 +77,16 @@ class CenteralController : public ClockedObject void createPRWorkload(int num_nodes, float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); + bool bufferRemoteUpdate(int slice_number, PacketPtr pkt); + int getnumGPTs() {return mpuVector.size();} + void recvDoneSignal(); int workCount(); float getPRError(); void printAnswerToHostSimout(); + std::unordered_map>> + remoteUpdates; }; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b6eec725f9..10a71a7ef1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -217,6 +217,10 @@ class CoalesceEngine : public BaseMemoryEngine ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); + int getSliceSize() + {return (int)(params().cache_size); } + // /sizeof(WorkListItem)); } + int workCount(); int futureWorkCount(); void recvVertexPull(); diff --git a/src/accl/graph/sega/coalesce_engine_s.cc b/src/accl/graph/sega/coalesce_engine_s.cc new file mode 100644 index 0000000000..6a5261d38c --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_s.cc @@ -0,0 +1,1223 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), + pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + activeBuffer.clear(); + postPushWBQueue.clear(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + + +// NOTE: Used for initializing memory and reading the final answer +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].state == CacheState::IDLE); + + 
pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + graphWorkload->init(pkt, currentDirectory); + if (pkt->getAddr() > lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } + memPort.sendFunctional(pkt); + } +} + +void +CoalesceEngine::postMemInitSetup() +{ + currentDirectory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::postConsumeProcess() +{ + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + WorkListItem items[numElementsPerLine]; + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + 
memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete read_pkt; + delete write_pkt; + } + } +} + +void +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void +CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); +} + +bool +CoalesceEngine::done() +{ + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && 
(onTheFlyReqs == 0); +} + +bool +CoalesceEngine::enoughSpace() +{ + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; +} + +bool +CoalesceEngine::pullCondition() +{ + bool enough_space = enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + + + +ReadReturnStatus +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); + //assert(addr in a right slice) + // assert((cacheBlocks[block_index].addr == aligned_addr)) + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) and + (cacheBlocks[block_index].valid)) { + // Hit + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(cacheBlocks[block_index].state != CacheState::INVALID); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].state = CacheState::BUSY; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else if (cacheBlocks[block_index].state == CacheState::PENDING_DATA) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].dirty); + + assert(MSHR.find(block_index) != MSHR.end()); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else { + // // miss + 
assert(cacheBlocks[block_index].addr != aligned_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a cold miss.\n", + __func__, addr); + stats.readMisses++; + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + + onTheFlyReqs--; + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); + } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. + // Regardless of the purpose for which it was read, it should + // be placed in the cache array. 
+ assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but it was allocated + // for in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will be not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before pkt arriveed. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } + + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. 
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + activeBuffer.emplace_back(pkt, curTick()); + } else { + stats.wastefulBytesRead += pkt->getSize(); + delete pkt; + } + + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, 
curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; + } + } + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. 
+ if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + + // NOTE: Design does not allow for write misses. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].busyMask != 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); + + // respective bit in busyMask for wl is set. 
+ assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].dirty |= true; + } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].items[wl_offset] = wl; + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); + } + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (cacheBlocks[block_index].busyMask == 0) { + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + 
atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + cacheBlocks[block_index].lastChangedTick = curTick(); + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // TODO: Figure out if this is still necessary. + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // the it is definitely currently not active. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + } else { + wb++; + } + } + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } + + need_send_pkt = false; + ab = activeBuffer.erase(ab); + delete ab_pkt; + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; + } + } + if (!need_send_pkt) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } + + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. 
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + if (enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + cacheBlocks[block_index].reset(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + if (!postPushWBQueue.empty()) { + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } + } +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + pullsScheduled--; + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); + } + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); + } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); + } + } +} + +void +CoalesceEngine::recvMemRetry() +{ + 
DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__);
+
+    if (!nextMemoryEvent.pending()) {
+        DPRINTF(CoalesceEngine, "%s: Not pending MemRetry.\n", __func__);
+        return;
+    }
+    assert(!nextMemoryEvent.scheduled());
+    nextMemoryEvent.wake();
+    schedule(nextMemoryEvent, nextCycle());
+}
+
+int
+CoalesceEngine::workCount()
+{
+    return currentActiveCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size();
+}
+
+void
+CoalesceEngine::recvVertexPull()
+{
+    pullsReceived++;
+    DPRINTF(CoalesceEngine, "%s: Received a vertex pull. pullsReceived: %d.\n", __func__, pullsReceived);
+
+    stats.verticesPulled++;
+    stats.lastVertexPullTime = curTick() - stats.lastResetTick;
+    if (!nextApplyEvent.scheduled()) {
+        schedule(nextApplyEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextApplyEvent()
+{
+    if ((!activeBuffer.empty()) &&
+        (postPushWBQueue.size() < postPushWBQueueSize)) {
+        PacketPtr pkt;
+        Tick entrance_tick;
+        WorkListItem items[numElementsPerLine];
+
+        std::tie(pkt, entrance_tick) = activeBuffer.front();
+        pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize);
+
+        for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) {
+            if (items[index].activeNow) {
+                Addr addr = pkt->getAddr() + index * sizeof(WorkListItem);
+                uint32_t delta = graphWorkload->apply(items[index]);
+                items[index].activeNow = false;
+                owner->recvVertexPush(addr, delta, items[index].edgeIndex,
+                                    items[index].degree);
+                pullsReceived--;
+                stats.verticesPushed++;
+                stats.lastVertexPushTime = curTick() - stats.lastResetTick;
+            }
+        }
+        pkt->deleteData();
+        pkt->allocate();
+        pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize);
+
+        bool atom_active_now = false;
+        for (int index = 0; index < numElementsPerLine; index++) {
+            atom_active_now |= items[index].activeNow;
+        }
+        // NOTE: If the atom is not active anymore. 
+ if (!atom_active_now) {
+            PacketPtr wb_pkt = createWritePacket(pkt->getAddr(),
+                                peerMemoryAtomSize, (uint8_t*) items);
+            postPushWBQueue.emplace_back(wb_pkt, curTick());
+            activeBuffer.pop_front();
+            memoryFunctionQueue.emplace_back(
+                [this] (int ignore, Tick schedule_tick) {
+                    processNextPostPushWB(ignore, schedule_tick);
+                }, 0, curTick());
+            if ((!nextMemoryEvent.pending()) &&
+                (!nextMemoryEvent.scheduled())) {
+                schedule(nextMemoryEvent, nextCycle());
+            }
+            delete pkt;
+        }
+    } else if (!currentActiveCacheBlocks.empty()) {
+        int num_visited_indices = 0;
+        int initial_fifo_length = currentActiveCacheBlocks.size();
+        while (true) {
+            int block_index = currentActiveCacheBlocks.front();
+            if (cacheBlocks[block_index].state == CacheState::IDLE) {
+                for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) {
+                    if (cacheBlocks[block_index].items[index].activeNow) {
+                        Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem);
+                        uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]);
+                        cacheBlocks[block_index].items[index].activeNow = false;
+                        cacheBlocks[block_index].dirty = true;
+                        owner->recvVertexPush(addr, delta,
+                            cacheBlocks[block_index].items[index].edgeIndex,
+                            cacheBlocks[block_index].items[index].degree);
+                        pullsReceived--;
+                        stats.verticesPushed++;
+                        stats.lastVertexPushTime = curTick() - stats.lastResetTick;
+                    }
+                }
+
+                bool atom_active_now = false;
+                for (int index = 0; index < numElementsPerLine; index++) {
+                    atom_active_now |= cacheBlocks[block_index].items[index].activeNow;
+                }
+                // NOTE: If the atom has no active items left, retire it from the FIFO
+                if (!atom_active_now) {
+                    currentActiveCacheBlocks.erase(block_index);
+                }
+                break;
+            }
+            // NOTE: If the block with index at the front of activeCacheBlocks
+            // is not in IDLE state, then roll that index to the back
+            currentActiveCacheBlocks.pop_front();
+            currentActiveCacheBlocks.push_back(block_index);
+            // NOTE: If we have visited all the 
items initially in the FIFO.
+            num_visited_indices++;
+            if (num_visited_indices == initial_fifo_length) {
+                break;
+            }
+        }
+    } else {
+        DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__);
+        stats.worklessCycles++;
+    }
+
+    if (pullCondition()) {
+        memoryFunctionQueue.emplace_back(
+            [this] (int ignore, Tick schedule_tick) {
+                processNextVertexPull(ignore, schedule_tick);
+            }, 0, curTick());
+        if ((!nextMemoryEvent.pending()) &&
+            (!nextMemoryEvent.scheduled())) {
+            schedule(nextMemoryEvent, nextCycle());
+        }
+        pullsScheduled++;
+    }
+
+    if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) {
+        schedule(nextApplyEvent, nextCycle());
+    }
+}
+
+void
+CoalesceEngine::processNextDoneSignalEvent()
+{
+    if (done()) {
+        owner->recvDoneSignal();
+    }
+}
+
+CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce)
+    : statistics::Group(&_coalesce),
+    coalesce(_coalesce),
+    lastResetTick(0),
+    ADD_STAT(numVertexReads, statistics::units::Count::get(),
+             "Number of memory vertices read from cache."),
+    ADD_STAT(numVertexWrites, statistics::units::Count::get(),
+             "Number of memory vertices written to cache."),
+    ADD_STAT(readHits, statistics::units::Count::get(),
+             "Number of cache hits."),
+    ADD_STAT(readMisses, statistics::units::Count::get(),
+             "Number of cache misses."),
+    ADD_STAT(readHitUnderMisses, statistics::units::Count::get(),
+             "Number of cache hit under misses."),
+    ADD_STAT(numConflicts, statistics::units::Count::get(),
+             "Number of conflicts raised by reads in the cache."),
+    ADD_STAT(responsePortShortage, statistics::units::Count::get(),
+             "Number of times a response has been "
+             "delayed because of port shortage. 
"), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by coalesce engine"), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "cycles the coalesce engine could not find work for apply"), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory"), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory"), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index f661bd68a6..318ea0798b 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/mpu.hh" +#include + #include "accl/graph/sega/centeral_controller.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -37,11 +39,13 @@ namespace gem5 { MPU::MPU(const Params& params): - SimObject(params), + ClockedObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), - pushEngine(params.push_engine) + pushEngine(params.push_engine), + sliceCounter(0), + nextSliceEvent([this] { processNextSliceEvent(); }, name()) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); @@ -54,12 +58,56 @@ MPU::registerCenteralController(CenteralController* centeral_controller) centeralController = centeral_controller; } +int +MPU::getSliceSize() +{ + int slice_number = + (coalesceEngine->getSliceSize() * centeralController->getnumGPTs()); + + return slice_number; +} + +bool +MPU::bufferRemoteUpdate(int slice_number, PacketPtr pkt) +{ + return 
centeralController->bufferRemoteUpdate(slice_number, pkt); +} + bool MPU::handleIncomingUpdate(PacketPtr pkt) { return wlEngine->handleIncomingUpdate(pkt); } +void +MPU::scheduleNewSlice() +{ + if (!nextSliceEvent.scheduled()) { + schedule(nextSliceEvent, nextCycle()); + } + return; +} + +void +MPU::processNextSliceEvent() +{ + auto new_update = + centeralController->remoteUpdates[this][this->getSliceCounter()].front(); + bool sent = wlEngine->handleIncomingUpdate(new_update); + + centeralController->remoteUpdates[this] + [this->getSliceCounter()].pop_front(); + if (!sent) { + centeralController->remoteUpdates[this] + [this->getSliceCounter()].push_back(new_update); + } + + if (!centeralController->remoteUpdates[this][this->getSliceCounter()].empty() && !nextSliceEvent.scheduled()) { + schedule(nextSliceEvent, nextCycle()); + } + +} + void MPU::handleIncomingWL(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 95d3adeca5..2008a7dc4f 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -39,7 +39,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/packet.hh" -#include "sim/sim_object.hh" +#include "sim/clocked_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -48,7 +48,7 @@ namespace gem5 class CenteralController; -class MPU : public SimObject +class MPU : public ClockedObject { private: System* system; @@ -57,7 +57,10 @@ class MPU : public SimObject WLEngine* wlEngine; CoalesceEngine* coalesceEngine; PushEngine* pushEngine; + int sliceCounter; + EventFunctionWrapper nextSliceEvent; + void processNextSliceEvent(); public: PARAMS(MPU); MPU(const Params& params); @@ -74,8 +77,15 @@ class MPU : public SimObject void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } void swapDirectories() { coalesceEngine->swapDirectories(); } - bool handleIncomingUpdate(PacketPtr pkt); + int getSliceSize(); + int getSliceCounter() { return 
sliceCounter; } + int increaseSliceCounter() { return sliceCounter++; } + void updateSliceCounter(int value) { sliceCounter = value;} + void resetSliceCounter() { sliceCounter = 0; } + bool bufferRemoteUpdate(int slice_number, PacketPtr pkt); + void scheduleNewSlice(); + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index d563450179..b4649b6a9d 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -149,6 +149,19 @@ WLEngine::done() bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { + int slice_number = (int)(pkt->getAddr()/(owner->getSliceSize())); + if (slice_number != owner->getSliceCounter()) { + DPRINTF(WLEngine, "%s: Packet %lu slice number is: %d. The current " + "slice number is: %d, The total number of vertices/slice: %d \n", + __func__, pkt->getAddr(), slice_number, + owner->getSliceCounter(), + owner->getSliceSize()/sizeof(WorkListItem)); + bool ret = owner->bufferRemoteUpdate(slice_number, pkt); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return ret; + } assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; @@ -173,6 +186,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return true; } + // TODO: Parameterize the number of pops WLEngine can do at a time. // TODO: Add a histogram stats of the size of the updateQueue. Sample here. void